diff options
author | Egor Tensin <Egor.Tensin@gmail.com> | 2016-06-18 17:13:44 +0300 |
---|---|---|
committer | Egor Tensin <Egor.Tensin@gmail.com> | 2016-06-18 17:13:44 +0300 |
commit | 768af3bea05346d3b412351f70a95454535a8955 (patch) | |
tree | 1eb40f573311df2c2ee7a319cfc41711816a50bb | |
parent | README update (diff) | |
download | vk-scripts-768af3bea05346d3b412351f70a95454535a8955.tar.gz vk-scripts-768af3bea05346d3b412351f70a95454535a8955.zip |
implement grouping by hour of the day
-rw-r--r-- | README.md | 46 | ||||
-rw-r--r-- | img/online_duration/hour.png | bin | 0 -> 53227 bytes | |||
-rw-r--r-- | online_duration.py | 71 | ||||
-rw-r--r-- | vk/utils/tracking/online_streaks.py | 52 |
4 files changed, 145 insertions, 24 deletions
@@ -84,8 +84,9 @@ You can also get a JSON document: ] The durations are calculated on a per-user basis by default. -You can change that by supplying either `date` (to group by dates) or `weekday` -(to group by weekdays) as the `--grouping` parameter value. +You can change that by supplying either `date` (to group by dates), `weekday` +(to group by weekdays) or `hour` (to group by day hours) as the `--grouping` +parameter value. For example (assuming that both Jane and Joe spent their time online on Friday, June 17, 2016). @@ -110,6 +111,34 @@ Saturday,0:00:00 Sunday,0:00:00 ``` +``` +> online_duration.py --grouping hour db.csv +0:00:00,0:00:00 +1:00:00,0:00:00 +2:00:00,0:00:00 +3:00:00,0:00:00 +4:00:00,0:03:56 +5:00:00,0:14:14 +6:00:00,0:29:30 +7:00:00,0:31:20 +8:00:00,0:12:04 +9:00:00,0:00:00 +10:00:00,0:00:00 +11:00:00,0:23:14 +12:00:00,0:06:00 +13:00:00,0:46:19 +14:00:00,0:00:00 +15:00:00,0:00:00 +16:00:00,0:00:00 +17:00:00,0:00:00 +18:00:00,0:00:00 +19:00:00,0:00:00 +20:00:00,0:00:00 +21:00:00,0:00:00 +22:00:00,0:00:00 +23:00:00,0:00:00 +``` + In my opinion, the script's most useful feature is the ability to easily create plots that represent the text data (like in the examples above). To produce a plot, pass `img` as the `--output-format` parameter value and add @@ -127,12 +156,25 @@ a file path to write the image to. ![weekday.png] + > online_duration.py --output-format img --grouping hour db.csv hour.png + +![hour.png] + +You can limit the scope of the database by supplying a time range. +Only online durations that are within the supplied range shall then be +processed. +Set the range by specifying both or one of the `--from` and `--to` parameters. +Values must be in the `%Y-%m-%dT%H:%M:%SZ` format (a subset of ISO 8601). + +All dates and times are in UTC. 
+ [matplotlib]: http://matplotlib.org/ [track_status.py]: #track_statuspy [user.png]: img/online_duration/user.png [date.png]: img/online_duration/date.png [weekday.png]: img/online_duration/weekday.png +[hour.png]: img/online_duration/hour.png ### mutual_friends.py diff --git a/img/online_duration/hour.png b/img/online_duration/hour.png Binary files differ new file mode 100644 index 0000000..cad26b3 --- /dev/null +++ b/img/online_duration/hour.png diff --git a/online_duration.py b/online_duration.py index 637b0ec..68f716a 100644 --- a/online_duration.py +++ b/online_duration.py @@ -4,7 +4,7 @@ import csv from collections import OrderedDict -from datetime import timedelta +from datetime import datetime, timedelta, timezone from enum import Enum import json import sys @@ -20,14 +20,17 @@ class Grouping(Enum): USER = 'user' DATE = 'date' WEEKDAY = 'weekday' + HOUR = 'hour' - def enum_durations(self, db_reader): + def enum_durations(self, db_reader, date_from=None, date_to=None): if self is Grouping.USER: - return OnlineStreakEnumerator().group_by_user(db_reader) + return OnlineStreakEnumerator(date_from, date_to).group_by_user(db_reader) elif self is Grouping.DATE: - return OnlineStreakEnumerator().group_by_date(db_reader) + return OnlineStreakEnumerator(date_from, date_to).group_by_date(db_reader) elif self is Grouping.WEEKDAY: - return OnlineStreakEnumerator().group_by_weekday(db_reader) + return OnlineStreakEnumerator(date_from, date_to).group_by_weekday(db_reader) + elif self is Grouping.HOUR: + return OnlineStreakEnumerator(date_from, date_to).group_by_hour(db_reader) else: raise NotImplementedError('unsupported grouping: ' + str(self)) @@ -54,10 +57,14 @@ class OutputWriterCSV: def _weekday_to_row(weekday): return [str(weekday)] + def _hour_to_row(hour): + return [str(timedelta(hours=hour))] + _CONVERT_KEY_TO_ROW = { Grouping.USER: _user_to_row, Grouping.DATE: _date_to_row, Grouping.WEEKDAY: _weekday_to_row, + Grouping.HOUR: _hour_to_row, } @staticmethod @@ 
-66,8 +73,8 @@ class OutputWriterCSV: raise NotImplementedError('unsupported grouping: ' + str(grouping)) return OutputWriterCSV._CONVERT_KEY_TO_ROW[grouping](key) - def process_database(self, grouping, db_reader): - for key, duration in grouping.enum_durations(db_reader).items(): + def process_database(self, grouping, db_reader, date_from=None, date_to=None): + for key, duration in grouping.enum_durations(db_reader, date_from, date_to).items(): row = self._key_to_row(grouping, key) row.append(str(duration)) self._write_row(row) @@ -77,6 +84,7 @@ class OutputWriterCSV: _DATE_FIELD = 'date' _WEEKDAY_FIELD = 'weekday' +_HOUR_FIELD = 'hour' class OutputWriterJSON: def __init__(self, fd=sys.stdout): @@ -98,12 +106,18 @@ class OutputWriterJSON: obj[_WEEKDAY_FIELD] = str(weekday) return obj + def _hour_to_object(hour): + obj = OrderedDict() + obj[_HOUR_FIELD] = str(timedelta(hours=hour)) + return obj + _DURATION_FIELD = 'duration' _CONVERT_KEY_TO_OBJECT = { Grouping.USER: _user_to_object, Grouping.DATE: _date_to_object, Grouping.WEEKDAY: _weekday_to_object, + Grouping.HOUR: _hour_to_object, } @staticmethod @@ -112,9 +126,9 @@ class OutputWriterJSON: raise NotImplementedError('unsupported grouping: ' + str(grouping)) return OutputWriterJSON._CONVERT_KEY_TO_OBJECT[grouping](key) - def process_database(self, grouping, db_reader): + def process_database(self, grouping, db_reader, date_from=None, date_to=None): arr = [] - for key, duration in grouping.enum_durations(db_reader).items(): + for key, duration in grouping.enum_durations(db_reader, date_from, date_to).items(): obj = self._key_to_object(grouping, key) obj[self._DURATION_FIELD] = str(duration) arr.append(obj) @@ -173,7 +187,7 @@ class BarChartBuilder: def set_height(self, inches): self._set_size(inches, dim=1) - def plot_bars(self, bar_labels, values): + def plot_bars(self, bar_labels, values, datetime_ticks=False): numof_bars = len(bar_labels) if not numof_bars: @@ -181,12 +195,16 @@ class BarChartBuilder: 
self._get_bar_axis().set_tick_params(labelleft=False) return [] - self.set_height(numof_bars) + self.set_height(numof_bars / 2 if datetime_ticks else numof_bars) bar_offsets = np.arange(numof_bars) * 2 * self._BAR_HEIGHT + self._BAR_HEIGHT bar_axis_min, bar_axis_max = 0, 2 * self._BAR_HEIGHT * numof_bars - self._get_bar_axis().set_ticks(bar_offsets) + if datetime_ticks: + self._get_bar_axis().set_ticks(bar_offsets - self._BAR_HEIGHT) + else: + self._get_bar_axis().set_ticks(bar_offsets) + self._get_bar_axis().set_ticklabels(bar_labels) self.set_bar_axis_limits(bar_axis_min, bar_axis_max) @@ -211,10 +229,14 @@ class PlotBuilder: def _format_weekday(weekday): return str(weekday) + def _format_hour(hour): + return '{}:00'.format(hour) + _FORMAT_KEY = { Grouping.USER: _format_user, Grouping.DATE: _format_date, Grouping.WEEKDAY: _format_weekday, + Grouping.HOUR: _format_hour, } @staticmethod @@ -239,8 +261,8 @@ class PlotBuilder: def _extract_values(durations): return tuple(map(PlotBuilder._duration_to_seconds, durations.values())) - def process_database(self, grouping, db_reader): - durations = grouping.enum_durations(db_reader) + def process_database(self, grouping, db_reader, date_from=None, date_to=None): + durations = grouping.enum_durations(db_reader, date_from, date_to) bar_chart = BarChartBuilder() @@ -258,7 +280,7 @@ class PlotBuilder: if not labels or not max(durations): bar_chart.set_value_axis_limits(0) - bars = bar_chart.plot_bars(labels, durations) + bars = bar_chart.plot_bars(labels, durations, grouping is Grouping.HOUR) bar_chart.set_property(bars, alpha=.33) if self._fd is sys.stdout: @@ -305,6 +327,11 @@ if __name__ == '__main__': return OutputFormat(s) except ValueError: raise argparse.ArgumentError() + def date_range_limit(s): + try: + return datetime.strptime(s, '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=timezone.utc) + except ValueError: + raise argparse.ArgumentError() parser.add_argument('input', type=argparse.FileType('r'), help='database path') @@ 
-323,9 +350,21 @@ if __name__ == '__main__': choices=tuple(fmt for fmt in OutputFormat), default=OutputFormat.CSV, help='specify output format') + parser.add_argument('--from', type=date_range_limit, default=None, + dest='date_from', + help='set the date to process database records from') + parser.add_argument('--to', type=date_range_limit, default=None, + dest='date_to', + help='set the date to process database record to') args = parser.parse_args() + if args.date_from is not None and args.date_to is not None: + if args.date_from > args.date_to: + args.date_from, args.date_to = args.date_to, args.date_from + with args.input_format.create_reader(args.input) as db_reader: output_writer = args.output_format.create_writer(args.output) - output_writer.process_database(args.grouping, db_reader) + output_writer.process_database( + args.grouping, db_reader, date_from=args.date_from, + date_to=args.date_to) diff --git a/vk/utils/tracking/online_streaks.py b/vk/utils/tracking/online_streaks.py index dfac0be..5c9aa48 100644 --- a/vk/utils/tracking/online_streaks.py +++ b/vk/utils/tracking/online_streaks.py @@ -22,8 +22,10 @@ class Weekday(Enum): return self.name[0] + self.name[1:].lower() class OnlineStreakEnumerator(MutableMapping): - def __init__(self): + def __init__(self, date_from=None, date_to=None): self._records = {} + self._date_from = date_from + self._date_to = date_to def __getitem__(self, user): return self._records[user] @@ -40,11 +42,32 @@ class OnlineStreakEnumerator(MutableMapping): def __len__(self): return len(self._records) + def _cut_period(self, streak): + user, time_from, time_to = streak + #print(user.get_first_name(), time_from, self._date_from) + if self._date_from is not None: + if time_to < self._date_from: + #print(1) + return None + if time_from < self._date_from: + #print(2) + time_from = self._date_from + if self._date_to is not None: + if time_from > self._date_to: + #print(3) + return None + if time_to > self._date_to: + #print(4) + time_to 
= self._date_to + return user, time_from, time_to + def enum(self, db_reader): for record in db_reader: - period = self._insert_record(record) - if period is not None: - yield period + streak = self._insert_record(record) + if streak is not None: + streak = self._cut_period(streak) + if streak is not None: + yield streak def group_by_user(self, db_reader): by_user = {} @@ -72,6 +95,15 @@ class OnlineStreakEnumerator(MutableMapping): by_weekday[Weekday(date.weekday())] += duration return by_weekday + def group_by_hour(self, db_reader): + by_hour = OrderedDict() + for i in range(24): + by_hour[i] = timedelta() + for _, time_from, time_to in self.enum(db_reader): + for hour, duration in self._enum_hours_and_durations(time_from, time_to): + by_hour[hour] += duration + return by_hour + @staticmethod def _enum_dates_and_durations(time_from, time_to): while time_from.date() != time_to.date(): @@ -80,6 +112,14 @@ class OnlineStreakEnumerator(MutableMapping): time_from = next_day yield time_to.date(), time_to - time_from + @staticmethod + def _enum_hours_and_durations(time_from, time_to): + while time_from.date() != time_to.date() or time_from.hour != time_to.hour: + next_hour = time_from.replace(minute=0, second=0) + timedelta(hours=1) + yield time_from.hour, next_hour - time_from + time_from = next_hour + yield time_to.hour, time_to - time_from + def _insert_record(self, record): return self._insert_user(record.to_user()) @@ -95,6 +135,6 @@ class OnlineStreakEnumerator(MutableMapping): return None if user.is_online(): return None - period = user, self[user].get_last_seen_time(), user.get_last_seen_time() + streak = user, self[user].get_last_seen_time(), user.get_last_seen_time() self[user] = user - return period + return streak |