From 5fc68a09752e47e61b0134146ff6795c7295d131 Mon Sep 17 00:00:00 2001 From: Egor Tensin Date: Tue, 19 Jul 2016 23:17:51 +0300 Subject: make the terminology more consistent "Online sessions" instead of "online periods/streaks/durations", "time ranges" instead of "date ranges", etc. --- bin/online_duration.py | 39 ++++++------ docs/online_duration.md | 7 +-- vk/tracking/__init__.py | 4 +- vk/tracking/online_sessions.py | 133 +++++++++++++++++++++++++++++++++++++++++ vk/tracking/online_streaks.py | 133 ----------------------------------------- 5 files changed, 157 insertions(+), 159 deletions(-) create mode 100644 vk/tracking/online_sessions.py delete mode 100644 vk/tracking/online_streaks.py diff --git a/bin/online_duration.py b/bin/online_duration.py index d5b8f35..ac9251b 100644 --- a/bin/online_duration.py +++ b/bin/online_duration.py @@ -13,7 +13,7 @@ import sys import matplotlib.pyplot as plt import numpy as np -from vk.tracking import OnlineStreakEnumerator +from vk.tracking import OnlineSessionEnumerator from vk.tracking.db import Format as DatabaseFormat from vk.user import UserField @@ -23,8 +23,8 @@ class GroupBy(Enum): WEEKDAY = 'weekday' HOUR = 'hour' - def enum_durations(self, db_reader, date_from=None, date_to=None): - online_streaks = OnlineStreakEnumerator(date_from, date_to) + def group(self, db_reader, time_from=None, time_to=None): + online_streaks = OnlineSessionEnumerator(time_from, time_to) if self is GroupBy.USER: return online_streaks.group_by_user(db_reader) elif self is GroupBy.DATE: @@ -80,8 +80,8 @@ class OutputWriterCSV: raise NotImplementedError('unsupported grouping: ' + str(group_by)) return OutputWriterCSV._CONVERT_KEY[group_by](key) - def process_database(self, group_by, db_reader, date_from=None, date_to=None): - for key, duration in group_by.enum_durations(db_reader, date_from, date_to).items(): + def process_database(self, group_by, db_reader, time_from=None, time_to=None): + for key, duration in group_by.group(db_reader, time_from, time_to).items(): row = self._key_to_row(group_by, key) row.append(str(duration)) self._write_row(row) @@ -148,9 +148,9 @@ class OutputWriterJSON: self._fd.write(json.dumps(x, indent=3, ensure_ascii=False)) self._fd.write('\n') - def process_database(self, group_by, db_reader, date_from=None, date_to=None): + def process_database(self, group_by, db_reader, time_from=None, time_to=None): arr = [] - for key, duration in group_by.enum_durations(db_reader, date_from, date_to).items(): + for key, duration in group_by.group(db_reader, time_from, time_to).items(): obj = self._key_to_object(group_by, key) obj[self._DURATION_FIELD] = str(duration) arr.append(obj) @@ -300,10 +300,9 @@ class OutputWriterPlot: return tuple(map(OutputWriterPlot._duration_to_seconds, durations.values())) def process_database( - self, group_by, db_reader, date_from=None, date_to=None): + self, group_by, db_reader, time_from=None, time_to=None): - durations = group_by.enum_durations( - db_reader, date_from, date_to) + durations = group_by.group(db_reader, time_from, time_to) bar_chart = BarChartBuilder() @@ -394,7 +393,7 @@ def _parse_args(args=sys.argv): type=_parse_group_by, choices=GroupBy, default=GroupBy.USER, - help='group online streaks by user/date/etc.') + help='group online sessions by user/date/etc.') parser.add_argument('-i', '--input-format', dest='db_fmt', type=_parse_database_format, default=DatabaseFormat.CSV, @@ -405,12 +404,12 @@ def _parse_args(args=sys.argv): choices=OutputFormat, default=OutputFormat.CSV, help='specify output format') - parser.add_argument('-a', '--from', dest='date_from', + parser.add_argument('-a', '--from', dest='time_from', type=_parse_date_range_limit, default=None, - help='set the date to process database records from') - parser.add_argument('-b', '--to', dest='date_to', + help='discard online activity prior to this moment') + parser.add_argument('-b', '--to', dest='time_to', type=_parse_date_range_limit, default=None, - help='set the date to process database record to') + help='discard online activity after this moment') return parser.parse_args(args[1:]) @@ -418,16 +417,16 @@ def write_online_duration( db_fd, db_fmt=DatabaseFormat.CSV, fd=sys.stdout, fmt=OutputFormat.CSV, group_by=GroupBy.USER, - date_from=None, date_to=None): + time_from=None, time_to=None): - if date_from is not None and date_to is not None: - if date_from > date_to: - date_from, date_to = date_to, date_from + if time_from is not None and time_to is not None: + if time_from > time_to: + time_from, time_to = time_to, time_from with db_fmt.create_reader(db_fd) as db_reader: output_writer = fmt.create_writer(fd) output_writer.process_database( - group_by, db_reader, date_from=date_from, date_to=date_to) + group_by, db_reader, time_from=time_from, time_to=time_to) def main(args=sys.argv): args = _parse_args(args) diff --git a/docs/online_duration.md b/docs/online_duration.md index 81e5e48..f5c703b 100644 --- a/docs/online_duration.md +++ b/docs/online_duration.md @@ -12,7 +12,7 @@ Run from the top-level directory using `python -m`: > python -m bin.online_duration -h usage: online_duration.py [-h] [-g {user,date,weekday,hour}] [-i {csv,log,null}] [-o {csv,json,plot}] - [-a DATE_FROM] [-b DATE_TO] + [-a TIME_FROM] [-b TIME_TO] input [output] ``` @@ -141,10 +141,9 @@ a file path to write the image to. ![hour.png] You can limit the scope of the database by supplying a time range. -Only online durations that are within the supplied range shall then be -processed. +Only online sessions that overlap with this range shall then be processed. Set the range by specifying both or one of the `--from` and `--to` parameters. -Values must be in the `%Y-%m-%dT%H:%M:%SZ` format (a subset of ISO 8601). +The values must be in the `%Y-%m-%dT%H:%M:%SZ` format (a subset of ISO 8601). All dates and times are in UTC. diff --git a/vk/tracking/__init__.py b/vk/tracking/__init__.py index cfc80ea..0404094 100644 --- a/vk/tracking/__init__.py +++ b/vk/tracking/__init__.py @@ -2,7 +2,7 @@ # This file is licensed under the terms of the MIT License. # See LICENSE.txt for details. -from .online_streaks import OnlineStreakEnumerator, Weekday +from .online_sessions import OnlineSessionEnumerator, Weekday from .status_tracker import StatusTracker -__all__ = 'online_streaks', 'status_tracker', +__all__ = 'online_sessions', 'status_tracker', diff --git a/vk/tracking/online_sessions.py b/vk/tracking/online_sessions.py new file mode 100644 index 0000000..debf1e6 --- /dev/null +++ b/vk/tracking/online_sessions.py @@ -0,0 +1,133 @@ +# Copyright 2016 Egor Tensin +# This file is licensed under the terms of the MIT License. +# See LICENSE.txt for details. + +from collections import OrderedDict +from collections.abc import MutableMapping +from datetime import timedelta +from enum import Enum + +class Weekday(Enum): + MONDAY = 0 + TUESDAY = 1 + WEDNESDAY = 2 + THURSDAY = 3 + FRIDAY = 4 + SATURDAY = 5 + SUNDAY = 6 + + def __str__(self): + return self.name[0] + self.name[1:].lower() + +class OnlineSessionEnumerator(MutableMapping): + def __init__(self, time_from=None, time_to=None): + self._records = {} + self._time_from = time_from + self._time_to = time_to + + def __getitem__(self, user): + return self._records[user] + + def __setitem__(self, user, record): + self._records[user] = record + + def __delitem__(self, user): + del self._records[user] + + def __iter__(self): + return iter(self._records) + + def __len__(self): + return len(self._records) + + def _trim_or_drop_session(self, session): + user, started_at, ended_at = session + if self._time_from is not None: + if ended_at < self._time_from: + return None + if started_at < self._time_from: + started_at = self._time_from + if self._time_to is not None: + if started_at > self._time_to: + return None + if ended_at > self._time_to: + ended_at = self._time_to + return user, started_at, ended_at + + def read_database(self, db_reader): + for record in db_reader: + session = self._process_database_record(record) + if session is not None: + session = self._trim_or_drop_session(session) + if session is not None: + yield session + + def group_by_user(self, db_reader): + by_user = {} + for user, started_at, ended_at in self.read_database(db_reader): + if user not in by_user: + by_user[user] = timedelta() + by_user[user] += ended_at - started_at + return by_user + + def group_by_date(self, db_reader): + by_date = {} + for _, started_at, ended_at in self.read_database(db_reader): + for date, duration in self._split_into_days(started_at, ended_at): + if date not in by_date: + by_date[date] = timedelta() + by_date[date] += duration + return by_date + + def group_by_weekday(self, db_reader): + by_weekday = OrderedDict() + for weekday in Weekday: + by_weekday[weekday] = timedelta() + for _, started_at, ended_at in self.read_database(db_reader): + for date, duration in self._split_into_days(started_at, ended_at): + by_weekday[Weekday(date.weekday())] += duration + return by_weekday + + def group_by_hour(self, db_reader): + by_hour = OrderedDict() + for i in range(24): + by_hour[i] = timedelta() + for _, started_at, ended_at in self.read_database(db_reader): + for hour, duration in self._split_into_hours(started_at, ended_at): + by_hour[hour] += duration + return by_hour + + @staticmethod + def _split_into_days(a, b): + while a.date() != b.date(): + next_day = a.date() + timedelta(days=1) + yield a.date(), next_day - a + a = next_day + yield b.date(), b - a + + @staticmethod + def _split_into_hours(a, b): + while a.date() != b.date() or a.hour != b.hour: + next_hour = a.replace(minute=0, second=0) + timedelta(hours=1) + yield a.hour, next_hour - a + a = next_hour + yield b.hour, b - a + + def _process_database_record(self, record): + return self._close_user_session(record.to_user()) + + def _known_user(self, user): + return user.get_uid() in self._records + + def _unknown_user(self, user): + return not self._known_user(user) + + def _close_user_session(self, user): + if user not in self or self[user].is_offline(): + self[user] = user + return None + if user.is_online(): + return None + session = user, self[user].get_last_seen_time(), user.get_last_seen_time() + self[user] = user + return session diff --git a/vk/tracking/online_streaks.py b/vk/tracking/online_streaks.py deleted file mode 100644 index db24053..0000000 --- a/vk/tracking/online_streaks.py +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright 2016 Egor Tensin -# This file is licensed under the terms of the MIT License. -# See LICENSE.txt for details. - -from collections import OrderedDict -from collections.abc import MutableMapping -from datetime import timedelta -from enum import Enum - -class Weekday(Enum): - MONDAY = 0 - TUESDAY = 1 - WEDNESDAY = 2 - THURSDAY = 3 - FRIDAY = 4 - SATURDAY = 5 - SUNDAY = 6 - - def __str__(self): - return self.name[0] + self.name[1:].lower() - -class OnlineStreakEnumerator(MutableMapping): - def __init__(self, date_from=None, date_to=None): - self._records = {} - self._date_from = date_from - self._date_to = date_to - - def __getitem__(self, user): - return self._records[user] - - def __setitem__(self, user, record): - self._records[user] = record - - def __delitem__(self, user): - del self._records[user] - - def __iter__(self): - return iter(self._records) - - def __len__(self): - return len(self._records) - - def _cut_period(self, streak): - user, time_from, time_to = streak - if self._date_from is not None: - if time_to < self._date_from: - return None - if time_from < self._date_from: - time_from = self._date_from - if self._date_to is not None: - if time_from > self._date_to: - return None - if time_to > self._date_to: - time_to = self._date_to - return user, time_from, time_to - - def enum(self, db_reader): - for record in db_reader: - streak = self._insert_record(record) - if streak is not None: - streak = self._cut_period(streak) - if streak is not None: - yield streak - - def group_by_user(self, db_reader): - by_user = {} - for user, time_from, time_to in self.enum(db_reader): - if user not in by_user: - by_user[user] = timedelta() - by_user[user] += time_to - time_from - return by_user - - def group_by_date(self, db_reader): - by_date = OrderedDict() - for _, time_from, time_to in self.enum(db_reader): - for date, duration in self._enum_dates_and_durations(time_from, time_to): - if date not in by_date: - by_date[date] = timedelta() - by_date[date] += duration - return by_date - - def group_by_weekday(self, db_reader): - by_weekday = OrderedDict() - for weekday in Weekday: - by_weekday[weekday] = timedelta() - for _, time_from, time_to in self.enum(db_reader): - for date, duration in self._enum_dates_and_durations(time_from, time_to): - by_weekday[Weekday(date.weekday())] += duration - return by_weekday - - def group_by_hour(self, db_reader): - by_hour = OrderedDict() - for i in range(24): - by_hour[i] = timedelta() - for _, time_from, time_to in self.enum(db_reader): - for hour, duration in self._enum_hours_and_durations(time_from, time_to): - by_hour[hour] += duration - return by_hour - - @staticmethod - def _enum_dates_and_durations(time_from, time_to): - while time_from.date() != time_to.date(): - next_day = time_from.date() + timedelta(days=1) - yield time_from.date(), next_day - time_from - time_from = next_day - yield time_to.date(), time_to - time_from - - @staticmethod - def _enum_hours_and_durations(time_from, time_to): - while time_from.date() != time_to.date() or time_from.hour != time_to.hour: - next_hour = time_from.replace(minute=0, second=0) + timedelta(hours=1) - yield time_from.hour, next_hour - time_from - time_from = next_hour - yield time_to.hour, time_to - time_from - - def _insert_record(self, record): - return self._insert_user(record.to_user()) - - def _known_user(self, user): - return user.get_uid() in self._records - - def _unknown_user(self, user): - return not self._known_user(user) - - def _insert_user(self, user): - if user not in self or self[user].is_offline(): - self[user] = user - return None - if user.is_online(): - return None - streak = user, self[user].get_last_seen_time(), user.get_last_seen_time() - self[user] = user - return streak -- cgit v1.2.3