Breaking timeseries data into sessions
26 October, 2014
Problem:
- You have a lot of data about users interacting with your site, with a UserID and timestamp for each transaction.
- However, you want to break this down into seperate user browsing sessions.
- You can't use any normal session variables from the logs, because they get recycled if the user logs on a second time later in the same day.
- For a simple first approximation, you want to apply the rule that if a user hasn't had any transactions for 30 minutes, they've finished their session.
- You have a lot of data and you need a set-wise SQL solution.
The problem is actually similar to the 'islands and gaps' sequence problem, which has lots of solutions online, but it's a tad harder because you can't use some of the properties of sequence data. The main trick is that it's much easier to find the big gaps between sessions than it is to find continuous sessions. So the solution below starts by finding the big gaps, and then converts these gaps to get the 'islands' of continuous activity, and finally grabs a couple of stats for the sake of demonstration.
Click permalink below to see the code!
--Test Data
CREATE TABLE #Log
( UserID int, [Timestamp] datetime)
INSERT INTO #Log
VALUES
(1, '2014-10-26 10:51:18'), (1, '2014-10-26 10:52:18'), (1, '2014-10-26 10:55:18'), (1, '2014-10-26 10:59:18'), (1, '2014-10-26 15:01:18'), (1, '2014-10-26 15:01:21'), (1, '2014-10-27 21:22:19'),
(2, '2014-10-15 12:19:01'), (2, '2014-10-15 12:19:12'), (2, '2014-10-15 12:19:45'), (2, '2014-10-15 12:20:03'), (2, '2014-10-17 14:55:13'), (2, '2014-10-17 14:55:19'),
(3, '2014-10-22 14:55:19')
--Use temp tables and indexes rather than cte's as it's going to be a lot of data.
--Get the max and min timestamp for each user
SELECT UserID, MIN([Timestamp]) AS [MinTimestamp], MAX([Timestamp]) AS [MaxTimestamp]
INTO #MinMax
FROM #Log
GROUP BY UserID
--Create a new copy of the data with the row_number field (or just add the row number in situ)
SELECT UserID, [Timestamp], row_number() OVER (PARTITION BY UserID ORDER BY [TimeStamp]) AS [ConsecutiveID]
INTO #Consec
FROM #Log
--Find all the gaps greater than 30 minutes
SELECT c1.UserID, c1.[Timestamp] AS [GapStart], c2.[Timestamp] AS [GapEnd]
INTO #SessionGaps
FROM #Consec c1
INNER JOIN #Consec c2
ON c1.UserID = c2.UserID
AND c1.ConsecutiveID = c2.ConsecutiveID - 1
AND DATEDIFF(minute, c1.[Timestamp], c2.[Timestamp]) > 30
--Union the gaps with the start and stop times for each user.
;WITH cte AS (
SELECT UserID, GapStart AS [SessionEnd], GapEnd AS [SessionStart]
FROM #SessionGaps
UNION ALL
SELECT UserID, MaxTimestamp, NULL
FROM #MinMax
UNION ALL
SELECT UserID, NULL, MinTimestamp
FROM #MinMax
)
, cte2 AS ( --now add a row number to order the 'gaps'
SELECT *, row_number() OVER (PARTITION BY UserID ORDER BY SessionEnd) AS [SessionID]
FROM cte
)
SELECT c1.UserID, --finally, join consecutive rows, taking the start and end times from the different rows
c1.SessionStart,
c2.SessionEnd,
c1.[SessionID]
INTO #Sessions
FROM cte2 c1
INNER JOIN cte2 c2
ON c1.UserID = c2.UserID
AND c1.[SessionID] = c2.[SessionID] -1
ORDER BY c1.UserID, c1.SessionStart
--Now do some stats on the sessions
SELECT s.UserID, s.SessionID, count(1) AS [TransactionCount]
INTO #SessionRowCounts
FROM #Log l
INNER JOIN #Sessions s
ON l.UserID = s.UserID
AND l.[Timestamp] BETWEEN s.SessionStart AND s.SessionEnd
GROUP BY s.UserID, s.[SessionID]
SELECT s.UserID
, s.SessionID
, SessionStart
, SessionEnd
, DATEDIFF(second, SessionStart, SessionEnd) AS [SessionLength]
, TransactionCount
FROM #Sessions s
INNER JOIN #SessionRowCounts src
ON s.UserID = src.UserID
AND s.SessionID = src.SessionID 
There are no published comments.
New comment