From 00a87519cce24d130f193cf8d2a540469f38efdc Mon Sep 17 00:00:00 2001 From: Egor Tensin Date: Tue, 19 Jul 2016 23:29:23 +0300 Subject: online_duration.py -> online_sessions.py --- README.md | 6 +- bin/online_duration.py | 436 ------------------------------------------------ bin/online_sessions.py | 436 ++++++++++++++++++++++++++++++++++++++++++++++++ docs/online_duration.md | 171 ------------------- docs/online_sessions.md | 171 +++++++++++++++++++ docs/track_status.md | 4 +- 6 files changed, 612 insertions(+), 612 deletions(-) delete mode 100644 bin/online_duration.py create mode 100644 bin/online_sessions.py delete mode 100644 docs/online_duration.md create mode 100644 docs/online_sessions.md diff --git a/README.md b/README.md index 72537c1..6744ea2 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ Prerequisites ------------- Python 3.4 or higher is required. -Additionally, [online_duration.py] uses the excellent [matplotlib] plotting +Additionally, [online_sessions.py] uses the excellent [matplotlib] plotting library. The versions below have been verified to work properly. @@ -32,12 +32,12 @@ The supplied scripts are listed below. * [mutual_friends.py] — Learn who your ex and her new boyfriend are both friends with. * [track_status.py] — Track when people go online/offline. -* [online_duration.py] — View/visualize the amount of time people spend +* [online_sessions.py] — View/visualize the amount of time people spend online. [mutual_friends.py]: docs/mutual_friends.md [track_status.py]: docs/track_status.md -[online_duration.py]: docs/online_duration.md +[online_sessions.py]: docs/online_sessions.md License ------- diff --git a/bin/online_duration.py b/bin/online_duration.py deleted file mode 100644 index ac9251b..0000000 --- a/bin/online_duration.py +++ /dev/null @@ -1,436 +0,0 @@ -# Copyright 2016 Egor Tensin -# This file is licensed under the terms of the MIT License. -# See LICENSE.txt for details. 
- -import argparse -import csv -from collections import OrderedDict -from datetime import datetime, timedelta, timezone -from enum import Enum -import json -import sys - -import matplotlib.pyplot as plt -import numpy as np - -from vk.tracking import OnlineSessionEnumerator -from vk.tracking.db import Format as DatabaseFormat -from vk.user import UserField - -class GroupBy(Enum): - USER = 'user' - DATE = 'date' - WEEKDAY = 'weekday' - HOUR = 'hour' - - def group(self, db_reader, time_from=None, time_to=None): - online_streaks = OnlineSessionEnumerator(time_from, time_to) - if self is GroupBy.USER: - return online_streaks.group_by_user(db_reader) - elif self is GroupBy.DATE: - return online_streaks.group_by_date(db_reader) - elif self is GroupBy.WEEKDAY: - return online_streaks.group_by_weekday(db_reader) - elif self is GroupBy.HOUR: - return online_streaks.group_by_hour(db_reader) - else: - raise NotImplementedError('unsupported grouping: ' + str(self)) - - def __str__(self): - return self.value - -_OUTPUT_USER_FIELDS = ( - UserField.UID, - UserField.FIRST_NAME, - UserField.LAST_NAME, - UserField.DOMAIN, -) - -class OutputConverterCSV: - @staticmethod - def convert_user(user): - return [user[field] for field in _OUTPUT_USER_FIELDS] - - @staticmethod - def convert_date(date): - return [str(date)] - - @staticmethod - def convert_weekday(weekday): - return [str(weekday)] - - @staticmethod - def convert_hour(hour): - return [str(timedelta(hours=hour))] - -class OutputWriterCSV: - def __init__(self, fd=sys.stdout): - self._writer = csv.writer(fd, lineterminator='\n') - - _CONVERT_KEY = { - GroupBy.USER: OutputConverterCSV.convert_user, - GroupBy.DATE: OutputConverterCSV.convert_date, - GroupBy.WEEKDAY: OutputConverterCSV.convert_weekday, - GroupBy.HOUR: OutputConverterCSV.convert_hour, - } - - @staticmethod - def _key_to_row(group_by, key): - if group_by not in OutputWriterCSV._CONVERT_KEY: - raise NotImplementedError('unsupported grouping: ' + str(group_by)) - return 
OutputWriterCSV._CONVERT_KEY[group_by](key) - - def process_database(self, group_by, db_reader, time_from=None, time_to=None): - for key, duration in group_by.group(db_reader, time_from, time_to).items(): - row = self._key_to_row(group_by, key) - row.append(str(duration)) - self._write_row(row) - - def _write_row(self, row): - self._writer.writerow(row) - -class OutputConverterJSON: - _DATE_FIELD = 'date' - _WEEKDAY_FIELD = 'weekday' - _HOUR_FIELD = 'hour' - - assert _DATE_FIELD not in map(str, _OUTPUT_USER_FIELDS) - assert _WEEKDAY_FIELD not in map(str, _OUTPUT_USER_FIELDS) - assert _HOUR_FIELD not in map(str, _OUTPUT_USER_FIELDS) - - @staticmethod - def convert_user(user): - obj = OrderedDict() - for field in _OUTPUT_USER_FIELDS: - obj[str(field)] = user[field] - return obj - - @staticmethod - def convert_date(date): - obj = OrderedDict() - obj[OutputConverterJSON._DATE_FIELD] = str(date) - return obj - - @staticmethod - def convert_weekday(weekday): - obj = OrderedDict() - obj[OutputConverterJSON._WEEKDAY_FIELD] = str(weekday) - return obj - - @staticmethod - def convert_hour(hour): - obj = OrderedDict() - obj[OutputConverterJSON._HOUR_FIELD] = str(timedelta(hours=hour)) - return obj - -class OutputWriterJSON: - def __init__(self, fd=sys.stdout): - self._fd = fd - - _DURATION_FIELD = 'duration' - - assert _DURATION_FIELD not in map(str, _OUTPUT_USER_FIELDS) - - _CONVERT_KEY = { - GroupBy.USER: OutputConverterJSON.convert_user, - GroupBy.DATE: OutputConverterJSON.convert_date, - GroupBy.WEEKDAY: OutputConverterJSON.convert_weekday, - GroupBy.HOUR: OutputConverterJSON.convert_hour, - } - - @staticmethod - def _key_to_object(group_by, key): - if not group_by in OutputWriterJSON._CONVERT_KEY: - raise NotImplementedError('unsupported grouping: ' + str(group_by)) - return OutputWriterJSON._CONVERT_KEY[group_by](key) - - def _write(self, x): - self._fd.write(json.dumps(x, indent=3, ensure_ascii=False)) - self._fd.write('\n') - - def process_database(self, group_by, 
db_reader, time_from=None, time_to=None): - arr = [] - for key, duration in group_by.group(db_reader, time_from, time_to).items(): - obj = self._key_to_object(group_by, key) - obj[self._DURATION_FIELD] = str(duration) - arr.append(obj) - self._write(arr) - -class BarChartBuilder: - _BAR_HEIGHT = 1. - - def __init__(self): - self._fig, self._ax = plt.subplots() - - def set_title(self, title): - self._ax.set_title(title) - - def _get_bar_axis(self): - return self._ax.get_yaxis() - - def _get_value_axis(self): - return self._ax.get_xaxis() - - def set_bar_axis_limits(self, start=None, end=None): - self._ax.set_ylim(bottom=start, top=end) - - def set_value_axis_limits(self, start=None, end=None): - self._ax.set_xlim(left=start, right=end) - - def set_value_grid(self): - self._get_value_axis().grid() - - def get_bar_labels(self): - return self._get_bar_axis().get_ticklabels() - - def get_value_labels(self): - return self._get_value_axis().get_ticklabels() - - def set_value_label_formatter(self, fn): - from matplotlib.ticker import FuncFormatter - self._get_value_axis().set_major_formatter(FuncFormatter(fn)) - - def set_integer_values_only(self): - from matplotlib.ticker import MaxNLocator - self._get_value_axis().set_major_locator(MaxNLocator(integer=True)) - - @staticmethod - def set_property(*args, **kwargs): - plt.setp(*args, **kwargs) - - def _set_size(self, inches, dim=0): - fig_size = self._fig.get_size_inches() - assert len(fig_size) == 2 - fig_size[dim] = inches - self._fig.set_size_inches(fig_size, forward=True) - - def set_width(self, inches): - self._set_size(inches) - - def set_height(self, inches): - self._set_size(inches, dim=1) - - def plot_bars( - self, bar_labels, bar_lengths, - bars_between_ticks=False, - inches_per_bar=1): - - numof_bars = len(bar_labels) - - if not numof_bars: - self.set_height(1) - self._get_bar_axis().set_tick_params(labelleft=False) - return [] - - self.set_height(inches_per_bar * numof_bars) - - bar_offsets = 
np.arange(numof_bars) * 2 * self._BAR_HEIGHT + self._BAR_HEIGHT - - if bars_between_ticks: - self._get_bar_axis().set_ticks(bar_offsets - self._BAR_HEIGHT) - else: - self._get_bar_axis().set_ticks(bar_offsets) - - bar_axis_min = 0 - bar_axis_max = 2 * self._BAR_HEIGHT * numof_bars - self.set_bar_axis_limits(bar_axis_min, bar_axis_max) - - self._get_bar_axis().set_ticklabels(bar_labels) - - return self._ax.barh( - bar_offsets, bar_lengths, align='center', height=self._BAR_HEIGHT) - - @staticmethod - def show(): - plt.show() - - def save(self, path): - self._fig.savefig(path, bbox_inches='tight') - -class OutputConverterPlot: - @staticmethod - def convert_user(user): - return '{}\n{}'.format(user.get_first_name(), user.get_last_name()) - - @staticmethod - def convert_date(date): - return str(date) - - @staticmethod - def convert_weekday(weekday): - return str(weekday) - - @staticmethod - def convert_hour(hour): - return '{}:00'.format(hour) - -class OutputWriterPlot: - def __init__(self, fd=sys.stdout): - self._fd = fd - - TITLE = 'How much time people spend online' - - _FORMAT_KEY = { - GroupBy.USER: OutputConverterPlot.convert_user, - GroupBy.DATE: OutputConverterPlot.convert_date, - GroupBy.WEEKDAY: OutputConverterPlot.convert_weekday, - GroupBy.HOUR: OutputConverterPlot.convert_hour, - } - - @staticmethod - def _format_key(group_by, key): - if group_by not in OutputWriterPlot._FORMAT_KEY: - raise NotImplementedError('unsupported grouping: ' + str(group_by)) - return OutputWriterPlot._FORMAT_KEY[group_by](key) - - @staticmethod - def _format_duration(seconds, _): - return str(timedelta(seconds=seconds)) - - @staticmethod - def _duration_to_seconds(td): - return td.total_seconds() - - @staticmethod - def _extract_labels(group_by, durations): - return tuple(map(lambda key: OutputWriterPlot._format_key(group_by, key), durations.keys())) - - @staticmethod - def _extract_values(durations): - return tuple(map(OutputWriterPlot._duration_to_seconds, durations.values())) - 
- def process_database( - self, group_by, db_reader, time_from=None, time_to=None): - - durations = group_by.group(db_reader, time_from, time_to) - - bar_chart = BarChartBuilder() - - bar_chart.set_title(OutputWriterPlot.TITLE) - bar_chart.set_value_grid() - - bar_chart.set_integer_values_only() - bar_chart.set_property( - bar_chart.get_value_labels(), fontsize='small', rotation=30) - bar_chart.set_value_label_formatter(self._format_duration) - - labels = self._extract_labels(group_by, durations) - durations = self._extract_values(durations) - - if not labels or not max(durations): - bar_chart.set_value_axis_limits(0) - - bars = bar_chart.plot_bars( - labels, durations, - bars_between_ticks=group_by is GroupBy.HOUR, - inches_per_bar=.5 if group_by is GroupBy.HOUR else 1) - bar_chart.set_property(bars, alpha=.33) - - if self._fd is sys.stdout: - bar_chart.show() - else: - bar_chart.save(self._fd) - -class OutputFormat(Enum): - CSV = 'csv' - JSON = 'json' - PLOT = 'plot' - - def create_writer(self, fd): - if self is OutputFormat.CSV: - return OutputWriterCSV(fd) - elif self is OutputFormat.JSON: - return OutputWriterJSON(fd) - elif self is OutputFormat.PLOT: - return OutputWriterPlot(fd) - else: - raise NotImplementedError('unsupported output format: ' + str(self)) - - def __str__(self): - return self.value - -def _parse_group_by(s): - try: - return GroupBy(s) - except ValueError: - raise argparse.ArgumentTypeError('invalid "group by" value: ' + s) - -def _parse_database_format(s): - try: - return DatabaseFormat(s) - except ValueError: - raise argparse.ArgumentTypeError('invalid database format: ' + s) - -def _parse_output_format(s): - try: - return OutputFormat(s) - except ValueError: - raise argparse.ArgumentTypeError('invalid output format: ' + s) - -_DATE_RANGE_LIMIT_FORMAT = '%Y-%m-%dT%H:%M:%SZ' - -def _parse_date_range_limit(s): - try: - dt = datetime.strptime(s, _DATE_RANGE_LIMIT_FORMAT) - return dt.replace(tzinfo=timezone.utc) - except ValueError: - msg = 
'invalid date range limit (must be in the \'{}\' format): {}' - raise argparse.ArgumentTypeError( - msg.format(_DATE_RANGE_LIMIT_FORMAT, s)) - -def _parse_args(args=sys.argv): - parser = argparse.ArgumentParser( - description='View/visualize the amount of time people spend online.') - - parser.add_argument('db_fd', metavar='input', - type=argparse.FileType('r', encoding='utf-8'), - help='database file path') - parser.add_argument('fd', metavar='output', nargs='?', - type=argparse.FileType('w', encoding='utf-8'), - default=sys.stdout, - help='output file path (standard output by default)') - parser.add_argument('-g', '--group-by', - type=_parse_group_by, - choices=GroupBy, - default=GroupBy.USER, - help='group online sessions by user/date/etc.') - parser.add_argument('-i', '--input-format', dest='db_fmt', - type=_parse_database_format, - default=DatabaseFormat.CSV, - choices=DatabaseFormat, - help='specify database format') - parser.add_argument('-o', '--output-format', dest='fmt', - type=_parse_output_format, - choices=OutputFormat, - default=OutputFormat.CSV, - help='specify output format') - parser.add_argument('-a', '--from', dest='time_from', - type=_parse_date_range_limit, default=None, - help='discard online activity prior to this moment') - parser.add_argument('-b', '--to', dest='time_to', - type=_parse_date_range_limit, default=None, - help='discard online activity after this moment') - - return parser.parse_args(args[1:]) - -def write_online_duration( - db_fd, db_fmt=DatabaseFormat.CSV, - fd=sys.stdout, fmt=OutputFormat.CSV, - group_by=GroupBy.USER, - time_from=None, time_to=None): - - if time_from is not None and time_to is not None: - if time_from > time_to: - time_from, time_to = time_to, time_from - - with db_fmt.create_reader(db_fd) as db_reader: - output_writer = fmt.create_writer(fd) - output_writer.process_database( - group_by, db_reader, time_from=time_from, time_to=time_to) - -def main(args=sys.argv): - args = _parse_args(args) - 
write_online_duration(**vars(args)) - -if __name__ == '__main__': - main() diff --git a/bin/online_sessions.py b/bin/online_sessions.py new file mode 100644 index 0000000..3732f4c --- /dev/null +++ b/bin/online_sessions.py @@ -0,0 +1,436 @@ +# Copyright 2016 Egor Tensin +# This file is licensed under the terms of the MIT License. +# See LICENSE.txt for details. + +import argparse +import csv +from collections import OrderedDict +from datetime import datetime, timedelta, timezone +from enum import Enum +import json +import sys + +import matplotlib.pyplot as plt +import numpy as np + +from vk.tracking import OnlineSessionEnumerator +from vk.tracking.db import Format as DatabaseFormat +from vk.user import UserField + +class GroupBy(Enum): + USER = 'user' + DATE = 'date' + WEEKDAY = 'weekday' + HOUR = 'hour' + + def group(self, db_reader, time_from=None, time_to=None): + online_streaks = OnlineSessionEnumerator(time_from, time_to) + if self is GroupBy.USER: + return online_streaks.group_by_user(db_reader) + elif self is GroupBy.DATE: + return online_streaks.group_by_date(db_reader) + elif self is GroupBy.WEEKDAY: + return online_streaks.group_by_weekday(db_reader) + elif self is GroupBy.HOUR: + return online_streaks.group_by_hour(db_reader) + else: + raise NotImplementedError('unsupported grouping: ' + str(self)) + + def __str__(self): + return self.value + +_OUTPUT_USER_FIELDS = ( + UserField.UID, + UserField.FIRST_NAME, + UserField.LAST_NAME, + UserField.DOMAIN, +) + +class OutputConverterCSV: + @staticmethod + def convert_user(user): + return [user[field] for field in _OUTPUT_USER_FIELDS] + + @staticmethod + def convert_date(date): + return [str(date)] + + @staticmethod + def convert_weekday(weekday): + return [str(weekday)] + + @staticmethod + def convert_hour(hour): + return [str(timedelta(hours=hour))] + +class OutputWriterCSV: + def __init__(self, fd=sys.stdout): + self._writer = csv.writer(fd, lineterminator='\n') + + _CONVERT_KEY = { + GroupBy.USER: 
OutputConverterCSV.convert_user, + GroupBy.DATE: OutputConverterCSV.convert_date, + GroupBy.WEEKDAY: OutputConverterCSV.convert_weekday, + GroupBy.HOUR: OutputConverterCSV.convert_hour, + } + + @staticmethod + def _key_to_row(group_by, key): + if group_by not in OutputWriterCSV._CONVERT_KEY: + raise NotImplementedError('unsupported grouping: ' + str(group_by)) + return OutputWriterCSV._CONVERT_KEY[group_by](key) + + def process_database(self, group_by, db_reader, time_from=None, time_to=None): + for key, duration in group_by.group(db_reader, time_from, time_to).items(): + row = self._key_to_row(group_by, key) + row.append(str(duration)) + self._write_row(row) + + def _write_row(self, row): + self._writer.writerow(row) + +class OutputConverterJSON: + _DATE_FIELD = 'date' + _WEEKDAY_FIELD = 'weekday' + _HOUR_FIELD = 'hour' + + assert _DATE_FIELD not in map(str, _OUTPUT_USER_FIELDS) + assert _WEEKDAY_FIELD not in map(str, _OUTPUT_USER_FIELDS) + assert _HOUR_FIELD not in map(str, _OUTPUT_USER_FIELDS) + + @staticmethod + def convert_user(user): + obj = OrderedDict() + for field in _OUTPUT_USER_FIELDS: + obj[str(field)] = user[field] + return obj + + @staticmethod + def convert_date(date): + obj = OrderedDict() + obj[OutputConverterJSON._DATE_FIELD] = str(date) + return obj + + @staticmethod + def convert_weekday(weekday): + obj = OrderedDict() + obj[OutputConverterJSON._WEEKDAY_FIELD] = str(weekday) + return obj + + @staticmethod + def convert_hour(hour): + obj = OrderedDict() + obj[OutputConverterJSON._HOUR_FIELD] = str(timedelta(hours=hour)) + return obj + +class OutputWriterJSON: + def __init__(self, fd=sys.stdout): + self._fd = fd + + _DURATION_FIELD = 'duration' + + assert _DURATION_FIELD not in map(str, _OUTPUT_USER_FIELDS) + + _CONVERT_KEY = { + GroupBy.USER: OutputConverterJSON.convert_user, + GroupBy.DATE: OutputConverterJSON.convert_date, + GroupBy.WEEKDAY: OutputConverterJSON.convert_weekday, + GroupBy.HOUR: OutputConverterJSON.convert_hour, + } + + 
@staticmethod + def _key_to_object(group_by, key): + if not group_by in OutputWriterJSON._CONVERT_KEY: + raise NotImplementedError('unsupported grouping: ' + str(group_by)) + return OutputWriterJSON._CONVERT_KEY[group_by](key) + + def _write(self, x): + self._fd.write(json.dumps(x, indent=3, ensure_ascii=False)) + self._fd.write('\n') + + def process_database(self, group_by, db_reader, time_from=None, time_to=None): + arr = [] + for key, duration in group_by.group(db_reader, time_from, time_to).items(): + obj = self._key_to_object(group_by, key) + obj[self._DURATION_FIELD] = str(duration) + arr.append(obj) + self._write(arr) + +class BarChartBuilder: + _BAR_HEIGHT = 1. + + def __init__(self): + self._fig, self._ax = plt.subplots() + + def set_title(self, title): + self._ax.set_title(title) + + def _get_bar_axis(self): + return self._ax.get_yaxis() + + def _get_value_axis(self): + return self._ax.get_xaxis() + + def set_bar_axis_limits(self, start=None, end=None): + self._ax.set_ylim(bottom=start, top=end) + + def set_value_axis_limits(self, start=None, end=None): + self._ax.set_xlim(left=start, right=end) + + def set_value_grid(self): + self._get_value_axis().grid() + + def get_bar_labels(self): + return self._get_bar_axis().get_ticklabels() + + def get_value_labels(self): + return self._get_value_axis().get_ticklabels() + + def set_value_label_formatter(self, fn): + from matplotlib.ticker import FuncFormatter + self._get_value_axis().set_major_formatter(FuncFormatter(fn)) + + def set_integer_values_only(self): + from matplotlib.ticker import MaxNLocator + self._get_value_axis().set_major_locator(MaxNLocator(integer=True)) + + @staticmethod + def set_property(*args, **kwargs): + plt.setp(*args, **kwargs) + + def _set_size(self, inches, dim=0): + fig_size = self._fig.get_size_inches() + assert len(fig_size) == 2 + fig_size[dim] = inches + self._fig.set_size_inches(fig_size, forward=True) + + def set_width(self, inches): + self._set_size(inches) + + def 
set_height(self, inches): + self._set_size(inches, dim=1) + + def plot_bars( + self, bar_labels, bar_lengths, + bars_between_ticks=False, + inches_per_bar=1): + + numof_bars = len(bar_labels) + + if not numof_bars: + self.set_height(1) + self._get_bar_axis().set_tick_params(labelleft=False) + return [] + + self.set_height(inches_per_bar * numof_bars) + + bar_offsets = np.arange(numof_bars) * 2 * self._BAR_HEIGHT + self._BAR_HEIGHT + + if bars_between_ticks: + self._get_bar_axis().set_ticks(bar_offsets - self._BAR_HEIGHT) + else: + self._get_bar_axis().set_ticks(bar_offsets) + + bar_axis_min = 0 + bar_axis_max = 2 * self._BAR_HEIGHT * numof_bars + self.set_bar_axis_limits(bar_axis_min, bar_axis_max) + + self._get_bar_axis().set_ticklabels(bar_labels) + + return self._ax.barh( + bar_offsets, bar_lengths, align='center', height=self._BAR_HEIGHT) + + @staticmethod + def show(): + plt.show() + + def save(self, path): + self._fig.savefig(path, bbox_inches='tight') + +class OutputConverterPlot: + @staticmethod + def convert_user(user): + return '{}\n{}'.format(user.get_first_name(), user.get_last_name()) + + @staticmethod + def convert_date(date): + return str(date) + + @staticmethod + def convert_weekday(weekday): + return str(weekday) + + @staticmethod + def convert_hour(hour): + return '{}:00'.format(hour) + +class OutputWriterPlot: + def __init__(self, fd=sys.stdout): + self._fd = fd + + TITLE = 'How much time people spend online' + + _FORMAT_KEY = { + GroupBy.USER: OutputConverterPlot.convert_user, + GroupBy.DATE: OutputConverterPlot.convert_date, + GroupBy.WEEKDAY: OutputConverterPlot.convert_weekday, + GroupBy.HOUR: OutputConverterPlot.convert_hour, + } + + @staticmethod + def _format_key(group_by, key): + if group_by not in OutputWriterPlot._FORMAT_KEY: + raise NotImplementedError('unsupported grouping: ' + str(group_by)) + return OutputWriterPlot._FORMAT_KEY[group_by](key) + + @staticmethod + def _format_duration(seconds, _): + return 
str(timedelta(seconds=seconds)) + + @staticmethod + def _duration_to_seconds(td): + return td.total_seconds() + + @staticmethod + def _extract_labels(group_by, durations): + return tuple(map(lambda key: OutputWriterPlot._format_key(group_by, key), durations.keys())) + + @staticmethod + def _extract_values(durations): + return tuple(map(OutputWriterPlot._duration_to_seconds, durations.values())) + + def process_database( + self, group_by, db_reader, time_from=None, time_to=None): + + durations = group_by.group(db_reader, time_from, time_to) + + bar_chart = BarChartBuilder() + + bar_chart.set_title(OutputWriterPlot.TITLE) + bar_chart.set_value_grid() + + bar_chart.set_integer_values_only() + bar_chart.set_property( + bar_chart.get_value_labels(), fontsize='small', rotation=30) + bar_chart.set_value_label_formatter(self._format_duration) + + labels = self._extract_labels(group_by, durations) + durations = self._extract_values(durations) + + if not labels or not max(durations): + bar_chart.set_value_axis_limits(0) + + bars = bar_chart.plot_bars( + labels, durations, + bars_between_ticks=group_by is GroupBy.HOUR, + inches_per_bar=.5 if group_by is GroupBy.HOUR else 1) + bar_chart.set_property(bars, alpha=.33) + + if self._fd is sys.stdout: + bar_chart.show() + else: + bar_chart.save(self._fd) + +class OutputFormat(Enum): + CSV = 'csv' + JSON = 'json' + PLOT = 'plot' + + def create_writer(self, fd): + if self is OutputFormat.CSV: + return OutputWriterCSV(fd) + elif self is OutputFormat.JSON: + return OutputWriterJSON(fd) + elif self is OutputFormat.PLOT: + return OutputWriterPlot(fd) + else: + raise NotImplementedError('unsupported output format: ' + str(self)) + + def __str__(self): + return self.value + +def _parse_group_by(s): + try: + return GroupBy(s) + except ValueError: + raise argparse.ArgumentTypeError('invalid "group by" value: ' + s) + +def _parse_database_format(s): + try: + return DatabaseFormat(s) + except ValueError: + raise 
argparse.ArgumentTypeError('invalid database format: ' + s) + +def _parse_output_format(s): + try: + return OutputFormat(s) + except ValueError: + raise argparse.ArgumentTypeError('invalid output format: ' + s) + +_DATE_RANGE_LIMIT_FORMAT = '%Y-%m-%dT%H:%M:%SZ' + +def _parse_date_range_limit(s): + try: + dt = datetime.strptime(s, _DATE_RANGE_LIMIT_FORMAT) + return dt.replace(tzinfo=timezone.utc) + except ValueError: + msg = 'invalid date range limit (must be in the \'{}\' format): {}' + raise argparse.ArgumentTypeError( + msg.format(_DATE_RANGE_LIMIT_FORMAT, s)) + +def _parse_args(args=sys.argv): + parser = argparse.ArgumentParser( + description='View/visualize the amount of time people spend online.') + + parser.add_argument('db_fd', metavar='input', + type=argparse.FileType('r', encoding='utf-8'), + help='database file path') + parser.add_argument('fd', metavar='output', nargs='?', + type=argparse.FileType('w', encoding='utf-8'), + default=sys.stdout, + help='output file path (standard output by default)') + parser.add_argument('-g', '--group-by', + type=_parse_group_by, + choices=GroupBy, + default=GroupBy.USER, + help='group online sessions by user/date/etc.') + parser.add_argument('-i', '--input-format', dest='db_fmt', + type=_parse_database_format, + default=DatabaseFormat.CSV, + choices=DatabaseFormat, + help='specify database format') + parser.add_argument('-o', '--output-format', dest='fmt', + type=_parse_output_format, + choices=OutputFormat, + default=OutputFormat.CSV, + help='specify output format') + parser.add_argument('-a', '--from', dest='time_from', + type=_parse_date_range_limit, default=None, + help='discard online activity prior to this moment') + parser.add_argument('-b', '--to', dest='time_to', + type=_parse_date_range_limit, default=None, + help='discard online activity after this moment') + + return parser.parse_args(args[1:]) + +def process_online_sessions( + db_fd, db_fmt=DatabaseFormat.CSV, + fd=sys.stdout, fmt=OutputFormat.CSV, + 
group_by=GroupBy.USER, + time_from=None, time_to=None): + + if time_from is not None and time_to is not None: + if time_from > time_to: + time_from, time_to = time_to, time_from + + with db_fmt.create_reader(db_fd) as db_reader: + output_writer = fmt.create_writer(fd) + output_writer.process_database( + group_by, db_reader, time_from=time_from, time_to=time_to) + +def main(args=sys.argv): + args = _parse_args(args) + process_online_sessions(**vars(args)) + +if __name__ == '__main__': + main() diff --git a/docs/online_duration.md b/docs/online_duration.md deleted file mode 100644 index f5c703b..0000000 --- a/docs/online_duration.md +++ /dev/null @@ -1,171 +0,0 @@ -online_duration.py -================== - -View/visualize the amount of time people spend online. - -Usage ------ - -Run from the top-level directory using `python -m`: - -``` -> python -m bin.online_duration -h -usage: online_duration.py [-h] [-g {user,date,weekday,hour}] - [-i {csv,log,null}] [-o {csv,json,plot}] - [-a TIME_FROM] [-b TIME_TO] - input [output] -``` - -This script additionally requires [matplotlib] to be installed. - -Analyze the database produced by [track_status.py] and calculate the total -amount of time people spent online. -For example (assuming the database in "db.csv" was generated by -[track_status.py] before): - -``` -> python -m bin.online_duration db.csv -89497105,John,Smith,john.smith,0:12:31 -3698577,Jane,Smith,jane.smith,1:34:46 -``` - -In the example above, "John Smith" and "Jane Smith" spent approx. 13 and 95 -minutes online respectively. - -The output format is CSV (comma-separated values) by default. 
-You can also get a JSON document: - -``` -> python -m bin.online_duration --output-format json db.csv -[ - { - "uid": 89497105, - "first_name": "John", - "last_name": "Smith", - "domain": "john.smith", - "duration": "0:12:31" - }, - { - "uid": 3698577, - "first_name": "Jane", - "last_name": "Smith", - "domain": "jane.smith", - "duration": "1:34:46" - } -] -``` - -The durations are calculated on a per-user basis by default. -You can change that by supplying either `date` (to group by dates), `weekday` -(to group by weekdays) or `hour` (to group by day hours) as the `--group-by` -parameter value. -For example (assuming that both Jane and Joe spent their time online on Friday, -June 17, 2016). - -``` -> python -m bin.online_duration --output-format json --group-by date db.csv -[ - { - "date": "2016-06-17", - "duration": "1:47:17" - } -] -``` - -``` -> python -m bin.online_duration --output-format csv --group-by weekday db.csv -Monday,0:00:00 -Tuesday,0:00:00 -Wednesday,0:00:00 -Thursday,0:00:00 -Friday,1:47:17 -Saturday,0:00:00 -Sunday,0:00:00 -``` - -``` -> python -m bin.online_duration --group-by hour db.csv -0:00:00,0:00:00 -1:00:00,0:00:00 -2:00:00,0:00:00 -3:00:00,0:00:00 -4:00:00,0:03:56 -5:00:00,0:14:14 -6:00:00,0:29:30 -7:00:00,0:31:20 -8:00:00,0:12:04 -9:00:00,0:00:00 -10:00:00,0:00:00 -11:00:00,0:23:14 -12:00:00,0:06:00 -13:00:00,0:46:19 -14:00:00,0:00:00 -15:00:00,0:00:00 -16:00:00,0:00:00 -17:00:00,0:00:00 -18:00:00,0:00:00 -19:00:00,0:00:00 -20:00:00,0:00:00 -21:00:00,0:00:00 -22:00:00,0:00:00 -23:00:00,0:00:00 -``` - -In my opinion, the script's most useful feature is its ability to easily create -plots that represent this data (like in the examples above). -To produce a plot, pass `plot` as the `--output-format` parameter value and add -a file path to write the image to. 
- -``` -> python -m bin.online_duration --output-format plot db.csv user.png -``` - -![user.png] - -``` -> python -m bin.online_duration --output-format plot --group-by date db.csv date.png -``` - -![date.png] - -``` -> python -m bin.online_duration --output-format plot --group-by weekday db.csv weekday.png -``` - -![weekday.png] - -``` -> python -m bin.online_duration --output-format plot --group-by hour db.csv hour.png -``` - -![hour.png] - -You can limit the scope of the database by supplying a time range. -Only online sessions that overlap with this range shall then be processed. -Set the range by specifying both or one of the `--from` and `--to` parameters. -The values must be in the `%Y-%m-%dT%H:%M:%SZ` format (a subset of ISO 8601). - -All dates and times are in UTC. - -[matplotlib]: http://matplotlib.org/ -[track_status.py]: track_status.md - -[user.png]: images/user.png -[date.png]: images/date.png -[weekday.png]: images/weekday.png -[hour.png]: images/hour.png - -Known issues ------------- - -* When people go online using the web version and don't visit other pages over -time (for example, just listening to music), they appear offline. -Hence the 0:00:00 durations you might sometimes encounter. -This might also happen using other clients. - -See also --------- - -* [License] - -[License]: ../README.md#license diff --git a/docs/online_sessions.md b/docs/online_sessions.md new file mode 100644 index 0000000..e6b4b08 --- /dev/null +++ b/docs/online_sessions.md @@ -0,0 +1,171 @@ +online_sessions.py +================== + +View/visualize the amount of time people spend online. + +Usage +----- + +Run from the top-level directory using `python -m`: + +``` +> python -m bin.online_sessions -h +usage: online_sessions.py [-h] [-g {user,date,weekday,hour}] + [-i {csv,log,null}] [-o {csv,json,plot}] + [-a TIME_FROM] [-b TIME_TO] + input [output] +``` + +This script additionally requires [matplotlib] to be installed. 
+ +Analyze the database produced by [track_status.py] and calculate the total +amount of time people spent online. +For example (assuming the database in "db.csv" was generated by +[track_status.py] before): + +``` +> python -m bin.online_sessions db.csv +89497105,John,Smith,john.smith,0:12:31 +3698577,Jane,Smith,jane.smith,1:34:46 +``` + +In the example above, "John Smith" and "Jane Smith" spent approx. 13 and 95 +minutes online respectively. + +The output format is CSV (comma-separated values) by default. +You can also get a JSON document: + +``` +> python -m bin.online_sessions --output-format json db.csv +[ + { + "uid": 89497105, + "first_name": "John", + "last_name": "Smith", + "domain": "john.smith", + "duration": "0:12:31" + }, + { + "uid": 3698577, + "first_name": "Jane", + "last_name": "Smith", + "domain": "jane.smith", + "duration": "1:34:46" + } +] +``` + +The durations are calculated on a per-user basis by default. +You can change that by supplying either `date` (to group by dates), `weekday` +(to group by weekdays) or `hour` (to group by day hours) as the `--group-by` +parameter value. +For example (assuming that both Jane and John spent their time online on Friday, +June 17, 2016): 
+ +``` +> python -m bin.online_sessions --output-format json --group-by date db.csv +[ + { + "date": "2016-06-17", + "duration": "1:47:17" + } +] +``` + +``` +> python -m bin.online_sessions --output-format csv --group-by weekday db.csv +Monday,0:00:00 +Tuesday,0:00:00 +Wednesday,0:00:00 +Thursday,0:00:00 +Friday,1:47:17 +Saturday,0:00:00 +Sunday,0:00:00 +``` + +``` +> python -m bin.online_sessions --group-by hour db.csv +0:00:00,0:00:00 +1:00:00,0:00:00 +2:00:00,0:00:00 +3:00:00,0:00:00 +4:00:00,0:03:56 +5:00:00,0:14:14 +6:00:00,0:29:30 +7:00:00,0:31:20 +8:00:00,0:12:04 +9:00:00,0:00:00 +10:00:00,0:00:00 +11:00:00,0:23:14 +12:00:00,0:06:00 +13:00:00,0:46:19 +14:00:00,0:00:00 +15:00:00,0:00:00 +16:00:00,0:00:00 +17:00:00,0:00:00 +18:00:00,0:00:00 +19:00:00,0:00:00 +20:00:00,0:00:00 +21:00:00,0:00:00 +22:00:00,0:00:00 +23:00:00,0:00:00 +``` + +In my opinion, the script's most useful feature is its ability to easily create +plots that represent this data (like in the examples above). +To produce a plot, pass `plot` as the `--output-format` parameter value and add +a file path to write the image to. + +``` +> python -m bin.online_sessions --output-format plot db.csv user.png +``` + +![user.png] + +``` +> python -m bin.online_sessions --output-format plot --group-by date db.csv date.png +``` + +![date.png] + +``` +> python -m bin.online_sessions --output-format plot --group-by weekday db.csv weekday.png +``` + +![weekday.png] + +``` +> python -m bin.online_sessions --output-format plot --group-by hour db.csv hour.png +``` + +![hour.png] + +You can limit the scope of the database by supplying a time range. +Only online sessions that overlap with this range shall then be processed. +Set the range by specifying both or one of the `--from` and `--to` parameters. +The values must be in the `%Y-%m-%dT%H:%M:%SZ` format (a subset of ISO 8601). + +All dates and times are in UTC. 
+ +[matplotlib]: http://matplotlib.org/ +[track_status.py]: track_status.md + +[user.png]: images/user.png +[date.png]: images/date.png +[weekday.png]: images/weekday.png +[hour.png]: images/hour.png + +Known issues +------------ + +* When people go online using the web version and don't visit other pages over +time (for example, just listening to music), they appear offline. +Hence the 0:00:00 durations you might sometimes encounter. +This might also happen using other clients. + +See also +-------- + +* [License] + +[License]: ../README.md#license diff --git a/docs/track_status.md b/docs/track_status.md index be4764a..8fe0746 100644 --- a/docs/track_status.md +++ b/docs/track_status.md @@ -30,11 +30,11 @@ For example (using made up user IDs/"screen names"), By default, the script produces a human-readable log. Use the `--log` parameter to write the log to a file. If you want to record when people go online/offline for further analysis using -[online_duration.py], specify the path to a database using the `--output` +[online_sessions.py], specify the path to a database using the `--output` parameter. Be careful: if the file already exists, it will be overwritten! -[online_duration.py]: online_duration.md +[online_sessions.py]: online_sessions.md See also -------- -- cgit v1.2.3