From 0eabb81f4a8664970ce11405fe3418cbcc1b1672 Mon Sep 17 00:00:00 2001 From: Egor Tensin Date: Sun, 19 Jun 2016 03:08:55 +0300 Subject: move files to subdirectories --- README.md | 218 +------------------ bin/mutual_friends.py | 75 +++++++ bin/online_duration.py | 370 +++++++++++++++++++++++++++++++++ bin/track_status.py | 58 ++++++ doc/images/online_duration/date.png | Bin 0 -> 14223 bytes doc/images/online_duration/hour.png | Bin 0 -> 53227 bytes doc/images/online_duration/user.png | Bin 0 -> 20610 bytes doc/images/online_duration/weekday.png | Bin 0 -> 33239 bytes doc/mutual_friends.md | 51 +++++ doc/online_duration.md | 161 ++++++++++++++ doc/track_status.md | 42 ++++ img/online_duration/date.png | Bin 14223 -> 0 bytes img/online_duration/hour.png | Bin 53227 -> 0 bytes img/online_duration/user.png | Bin 20610 -> 0 bytes img/online_duration/weekday.png | Bin 33239 -> 0 bytes mutual_friends.py | 75 ------- online_duration.py | 370 --------------------------------- track_status.py | 58 ------ 18 files changed, 766 insertions(+), 712 deletions(-) create mode 100644 bin/mutual_friends.py create mode 100644 bin/online_duration.py create mode 100644 bin/track_status.py create mode 100644 doc/images/online_duration/date.png create mode 100644 doc/images/online_duration/hour.png create mode 100644 doc/images/online_duration/user.png create mode 100644 doc/images/online_duration/weekday.png create mode 100644 doc/mutual_friends.md create mode 100644 doc/online_duration.md create mode 100644 doc/track_status.md delete mode 100644 img/online_duration/date.png delete mode 100644 img/online_duration/hour.png delete mode 100644 img/online_duration/user.png delete mode 100644 img/online_duration/weekday.png delete mode 100644 mutual_friends.py delete mode 100644 online_duration.py delete mode 100644 track_status.py diff --git a/README.md b/README.md index 48c6103..9d1ea3f 100644 --- a/README.md +++ b/README.md @@ -7,217 +7,17 @@ Requires Python 3.4 or higher. 
Usage ----- -Pass the `--help` flag to a script to see its detailed usage information. - -### track_status.py - -Track when people go online/offline. - - usage: track_status.py [-h] [-t TIMEOUT] [-l LOG] - [--output-format {csv,log,null}] [-o OUTPUT] - UID [UID ...] - -For example (using made up user IDs/"screen names"), - - > track_status.py john.doe jane.smith - [2016-06-18 01:43:34] John Doe is ONLINE. - [2016-06-18 01:43:34] John Doe was last seen at 2016-06-18 01:33:58+03:00 using the official iPhone app. - [2016-06-18 01:43:34] Jane Smith is OFFLINE. - [2016-06-18 01:43:34] Jane Smith was last seen at 2016-06-18 01:15:47+03:00 using the web version (or an unrecognized app). - [2016-06-18 01:59:09] Jane Smith went ONLINE. - [2016-06-18 01:59:09] Jane Smith was last seen at 2016-06-18 01:59:07+03:00 using the official Android app. - [2016-06-18 02:10:00] John Doe went OFFLINE. - [2016-06-18 02:10:00] John Doe was last seen at 2016-06-18 01:54:58+03:00 using the official iPhone app. - ... - -By default, the script produces a human-readable log. -Use the `--log` parameter to write the log to a file. -If you want to record when people go online/offline for [further analysis], -specify the path to a database using the `--output` parameter. -Be careful: if the file already exists, it will be overwritten! - -[further analysis]: #online_durationpy - -### online_duration.py - -View the amount of time people spent online. - - usage: online_duration.py [-h] [--grouping {user,date,weekday}] - [--input-format {csv,log,null}] - [--output-format {csv,json,img}] - input [output] - -This script additionally requires [matplotlib] to be installed. - -Analyze the database produced by [track_status.py] and calculate the total -amount of time people spent online. 
- -For example (assuming the database in "db.csv" was generated by -[track_status.py] before): - - > online_duration.py db.csv - 89497105,John,Smith,john.smith,0:12:31 - 3698577,Jane,Smith,jane.smith,1:34:46 - -In the example above, "John Smith" and "Jane Smith" spent approx. 13 and 95 -minutes online respectively. - -The output format is CSV (comma-separated values) by default. -You can also get a JSON document: - - > online_duration.py --output-format json db.csv - [ - { - "uid": 89497105, - "first_name": "John", - "last_name": "Smith", - "screen_name": "john.smith", - "duration": "0:12:31" - }, - { - "uid": 3698577, - "first_name": "Jane", - "last_name": "Smith", - "screen_name": "jane.smith", - "duration": "1:34:46" - } - ] - -The durations are calculated on a per-user basis by default. -You can change that by supplying either `date` (to group by dates), `weekday` -(to group by weekdays) or `hour` (to group by day hours) as the `--grouping` -parameter value. -For example (assuming that both Jane and Joe spent their time online on Friday, -June 17, 2016). - -``` -> online_duration.py --output-format json --grouping date db.csv -[ - { - "date": "2016-06-17", - "duration": "1:47:17" - } -] -``` - -``` -> online_duration.py --output-format csv --grouping weekday db.csv -Monday,0:00:00 -Tuesday,0:00:00 -Wednesday,0:00:00 -Thursday,0:00:00 -Friday,1:47:17 -Saturday,0:00:00 -Sunday,0:00:00 -``` - -``` -> online_duration.py --grouping hour db.csv -0:00:00,0:00:00 -1:00:00,0:00:00 -2:00:00,0:00:00 -3:00:00,0:00:00 -4:00:00,0:03:56 -5:00:00,0:14:14 -6:00:00,0:29:30 -7:00:00,0:31:20 -8:00:00,0:12:04 -9:00:00,0:00:00 -10:00:00,0:00:00 -11:00:00,0:23:14 -12:00:00,0:06:00 -13:00:00,0:46:19 -14:00:00,0:00:00 -15:00:00,0:00:00 -16:00:00,0:00:00 -17:00:00,0:00:00 -18:00:00,0:00:00 -19:00:00,0:00:00 -20:00:00,0:00:00 -21:00:00,0:00:00 -22:00:00,0:00:00 -23:00:00,0:00:00 -``` +The main package is located in the "vk/" directory. 
-In my opinion, the script's most useful feature is the ability to easily create -plots that represent the text data (like in the examples above). -To produce a plot, pass `img` as the `--output-format` parameter value and add -a file path to write the image to. - - > online_duration.py --output-format img db.csv user.png - -![user.png] - - > online_duration.py --output-format img --grouping date db.csv date.png - -![date.png] - - > online_duration.py --output-format img --grouping weekday db.csv weekday.png - -![weekday.png] - - > online_duration.py --output-format img --grouping hour db.csv hour.png - -![hour.png] - -You can limit the scope of the database by supplying a time range. -Only online durations that are within the supplied range shall then be -processed. -Set the range by specifying both or one of the `--from` and `--to` parameters. -Values must be in the `%Y-%m-%dT%H:%M:%SZ` format (a subset of ISO 8601). - -All dates and times are in UTC. - -#### Known issues - -* When people go online using the web version and don't visit other pages over -time (for example, just listening to music), they appear offline. -Hence the 0:00:00 durations you might sometimes encounter. -This might also happen using other clients. - -[matplotlib]: http://matplotlib.org/ -[track_status.py]: #track_statuspy - -[user.png]: img/online_duration/user.png -[date.png]: img/online_duration/date.png -[weekday.png]: img/online_duration/weekday.png -[hour.png]: img/online_duration/hour.png - -### mutual_friends.py - -Learn who your ex and her new boyfriend are both friends with. - - usage: mutual_friends.py [-h] [--output-format {csv,json}] UID [UID ...] - -For example (using made up user IDs/"screen names"), - - > mutual_friends.py john.doe jane.doe - 89497105,John,Smith,john.smith - 3698577,Jane,Smith,jane.smith - -In the example above, both "John Doe" and "Jane Doe" are friends with "John -Smith" and "Jane Smith", whose user IDs are 89497105 and 3698577 respectively. 
-Their "screen names" (the part after "vk.com/" of their personal page URLs) are -"john.smith" and "jane.smith". - -The output format is CSV (comma-separated values) by default. -You can also get a JSON document: +Additionally, a few scripts are supplied in the "bin/" directory to show-case +the package's capabilities. +Pass the `--help` flag to a script to see its detailed usage information. +The supplied scripts are listed below. - > mutual_friends.py --output-format json john.doe jane.doe - [ - { - "uid": 89497105, - "first_name": "John", - "last_name": "Smith", - "screen_name": "john.smith" - }, - { - "uid": 3698577, - "first_name": "Jane", - "last_name": "Smith", - "screen_name": "jane.smith" - } - ] +* [mutual_friends.py]: Learn who your ex and her new boyfriend are both friends +with. +* [track_status.py]: Track when people go online/offline. +* [online_duration.py]: View/visualize the amount of time people spend online. License ------- diff --git a/bin/mutual_friends.py b/bin/mutual_friends.py new file mode 100644 index 0000000..8823619 --- /dev/null +++ b/bin/mutual_friends.py @@ -0,0 +1,75 @@ +# Copyright 2015 Egor Tensin +# This file is licensed under the terms of the MIT License. +# See LICENSE.txt for details. 
+ +from collections import OrderedDict +import csv +from enum import Enum +import json +import sys + +from vk.api import API, Language +from vk.user import UserField + +OUTPUT_FIELDS = UserField.UID, UserField.FIRST_NAME, UserField.LAST_NAME, UserField.SCREEN_NAME + +def query_friend_list(api, user): + return api.friends_get(user.get_uid(), fields=OUTPUT_FIELDS) + +def extract_output_fields(user): + new_user = OrderedDict() + for field in OUTPUT_FIELDS: + new_user[str(field)] = user[field] if field in user else None + return new_user + +def print_mutual_friends_csv(mutual_friends): + writer = csv.writer(sys.stdout, lineterminator='\n') + for user in mutual_friends: + user = extract_output_fields(user) + writer.writerow(user.values()) + +def print_mutual_friends_json(mutual_friends): + print(json.dumps([extract_output_fields(user) for user in mutual_friends], indent=3)) + +def print_mutual_friends(mutual_friends, fmt): + if fmt is OutputFormat.CSV: + print_mutual_friends_csv(mutual_friends) + elif fmt is OutputFormat.JSON: + print_mutual_friends_json(mutual_friends) + else: + raise NotImplementedError('unsupported output format: ' + str(fmt)) + +class OutputFormat(Enum): + CSV = 'csv' + JSON = 'json' + + def __str__(self): + return self.value + +if __name__ == '__main__': + import argparse + + def output_format(s): + try: + return OutputFormat(s) + except ValueError: + raise argparse.ArgumentError() + + parser = argparse.ArgumentParser( + description='Learn who your ex and her new boyfriend are both friends with.') + + parser.add_argument(metavar='UID', dest='uids', nargs='+', + help='user IDs or "screen names"') + parser.add_argument('--output-format', type=output_format, + choices=tuple(fmt for fmt in OutputFormat), + default=OutputFormat.CSV, + help='specify output format') + + args = parser.parse_args() + + api = API(Language.EN) + users = api.users_get(args.uids) + + friend_lists = map(lambda user: frozenset(query_friend_list(api, user)), users) + 
mutual_friends = frozenset.intersection(*friend_lists) + print_mutual_friends(mutual_friends, args.output_format) diff --git a/bin/online_duration.py b/bin/online_duration.py new file mode 100644 index 0000000..ca6fd66 --- /dev/null +++ b/bin/online_duration.py @@ -0,0 +1,370 @@ +# Copyright 2016 Egor Tensin +# This file is licensed under the terms of the MIT License. +# See LICENSE.txt for details. + +import csv +from collections import OrderedDict +from datetime import datetime, timedelta, timezone +from enum import Enum +import json +import sys + +import matplotlib.pyplot as plt +import numpy as np + +from vk.tracking import OnlineStreakEnumerator, Weekday +from vk.tracking.db import Format as DatabaseFormat +from vk.user import UserField + +class Grouping(Enum): + USER = 'user' + DATE = 'date' + WEEKDAY = 'weekday' + HOUR = 'hour' + + def enum_durations(self, db_reader, date_from=None, date_to=None): + if self is Grouping.USER: + return OnlineStreakEnumerator(date_from, date_to).group_by_user(db_reader) + elif self is Grouping.DATE: + return OnlineStreakEnumerator(date_from, date_to).group_by_date(db_reader) + elif self is Grouping.WEEKDAY: + return OnlineStreakEnumerator(date_from, date_to).group_by_weekday(db_reader) + elif self is Grouping.HOUR: + return OnlineStreakEnumerator(date_from, date_to).group_by_hour(db_reader) + else: + raise NotImplementedError('unsupported grouping: ' + str(self)) + + def __str__(self): + return self.value + +_USER_FIELDS = ( + UserField.UID, + UserField.FIRST_NAME, + UserField.LAST_NAME, + UserField.SCREEN_NAME, +) + +class OutputWriterCSV: + def __init__(self, fd=sys.stdout): + self._writer = csv.writer(fd, lineterminator='\n') + + def _user_to_row(user): + return [user[field] for field in _USER_FIELDS] + + def _date_to_row(date): + return [str(date)] + + def _weekday_to_row(weekday): + return [str(weekday)] + + def _hour_to_row(hour): + return [str(timedelta(hours=hour))] + + _CONVERT_KEY_TO_ROW = { + Grouping.USER: 
_user_to_row, + Grouping.DATE: _date_to_row, + Grouping.WEEKDAY: _weekday_to_row, + Grouping.HOUR: _hour_to_row, + } + + @staticmethod + def _key_to_row(grouping, key): + if grouping not in OutputWriterCSV._CONVERT_KEY_TO_ROW: + raise NotImplementedError('unsupported grouping: ' + str(grouping)) + return OutputWriterCSV._CONVERT_KEY_TO_ROW[grouping](key) + + def process_database(self, grouping, db_reader, date_from=None, date_to=None): + for key, duration in grouping.enum_durations(db_reader, date_from, date_to).items(): + row = self._key_to_row(grouping, key) + row.append(str(duration)) + self._write_row(row) + + def _write_row(self, row): + self._writer.writerow(row) + +_DATE_FIELD = 'date' +_WEEKDAY_FIELD = 'weekday' +_HOUR_FIELD = 'hour' + +class OutputWriterJSON: + def __init__(self, fd=sys.stdout): + self._fd = fd + + def _user_to_object(user): + obj = OrderedDict() + for field in _USER_FIELDS: + obj[str(field)] = user[field] + return obj + + def _date_to_object(date): + obj = OrderedDict() + obj[_DATE_FIELD] = str(date) + return obj + + def _weekday_to_object(weekday): + obj = OrderedDict() + obj[_WEEKDAY_FIELD] = str(weekday) + return obj + + def _hour_to_object(hour): + obj = OrderedDict() + obj[_HOUR_FIELD] = str(timedelta(hours=hour)) + return obj + + _DURATION_FIELD = 'duration' + + _CONVERT_KEY_TO_OBJECT = { + Grouping.USER: _user_to_object, + Grouping.DATE: _date_to_object, + Grouping.WEEKDAY: _weekday_to_object, + Grouping.HOUR: _hour_to_object, + } + + @staticmethod + def _key_to_object(grouping, key): + if not grouping in OutputWriterJSON._CONVERT_KEY_TO_OBJECT: + raise NotImplementedError('unsupported grouping: ' + str(grouping)) + return OutputWriterJSON._CONVERT_KEY_TO_OBJECT[grouping](key) + + def process_database(self, grouping, db_reader, date_from=None, date_to=None): + arr = [] + for key, duration in grouping.enum_durations(db_reader, date_from, date_to).items(): + obj = self._key_to_object(grouping, key) + obj[self._DURATION_FIELD] = 
str(duration) + arr.append(obj) + self._fd.write(json.dumps(arr, indent=3)) + +class BarChartBuilder: + _BAR_HEIGHT = 1. + + def __init__(self): + self._fig, self._ax = plt.subplots() + + def set_title(self, title): + self._ax.set_title(title) + + def _get_bar_axis(self): + return self._ax.get_yaxis() + + def _get_value_axis(self): + return self._ax.get_xaxis() + + def set_bar_axis_limits(self, start=None, end=None): + self._ax.set_ylim(bottom=start, top=end) + + def set_value_axis_limits(self, start=None, end=None): + self._ax.set_xlim(left=start, right=end) + + def set_value_grid(self): + self._get_value_axis().grid() + + def get_bar_labels(self): + return self._get_bar_axis().get_ticklabels() + + def get_value_labels(self): + return self._get_value_axis().get_ticklabels() + + def set_value_label_formatter(self, fn): + from matplotlib.ticker import FuncFormatter + self._get_value_axis().set_major_formatter(FuncFormatter(fn)) + + def set_integer_values_only(self): + from matplotlib.ticker import MaxNLocator + self._get_value_axis().set_major_locator(MaxNLocator(integer=True)) + + def set_property(self, *args, **kwargs): + plt.setp(*args, **kwargs) + + def _set_size(self, inches, dim=0): + fig_size = self._fig.get_size_inches() + assert len(fig_size) == 2 + fig_size[dim] = inches + self._fig.set_size_inches(fig_size, forward=True) + + def set_width(self, inches): + self._set_size(inches) + + def set_height(self, inches): + self._set_size(inches, dim=1) + + def plot_bars(self, bar_labels, values, datetime_ticks=False): + numof_bars = len(bar_labels) + + if not numof_bars: + self.set_height(1) + self._get_bar_axis().set_tick_params(labelleft=False) + return [] + + self.set_height(numof_bars / 2 if datetime_ticks else numof_bars) + + bar_offsets = np.arange(numof_bars) * 2 * self._BAR_HEIGHT + self._BAR_HEIGHT + bar_axis_min, bar_axis_max = 0, 2 * self._BAR_HEIGHT * numof_bars + + if datetime_ticks: + self._get_bar_axis().set_ticks(bar_offsets - self._BAR_HEIGHT) + 
else: + self._get_bar_axis().set_ticks(bar_offsets) + + self._get_bar_axis().set_ticklabels(bar_labels) + self.set_bar_axis_limits(bar_axis_min, bar_axis_max) + + return self._ax.barh(bar_offsets, values, align='center', height=self._BAR_HEIGHT) + + def show(self): + plt.show() + + def save(self, path): + self._fig.savefig(path, bbox_inches='tight') + +class PlotBuilder: + def __init__(self, fd=sys.stdout): + self._fd = fd + + def _format_user(user): + return '{}\n{}'.format(user.get_first_name(), user.get_last_name()) + + def _format_date(date): + return str(date) + + def _format_weekday(weekday): + return str(weekday) + + def _format_hour(hour): + return '{}:00'.format(hour) + + _FORMAT_KEY = { + Grouping.USER: _format_user, + Grouping.DATE: _format_date, + Grouping.WEEKDAY: _format_weekday, + Grouping.HOUR: _format_hour, + } + + @staticmethod + def _format_key(grouping, key): + if grouping not in PlotBuilder._FORMAT_KEY: + raise NotImplementedError('unsupported grouping: ' + str(grouping)) + return PlotBuilder._FORMAT_KEY[grouping](key) + + @staticmethod + def _format_duration(seconds, _): + return str(timedelta(seconds=seconds)) + + @staticmethod + def _duration_to_seconds(td): + return td.total_seconds() + + @staticmethod + def _extract_labels(grouping, durations): + return tuple(map(lambda key: PlotBuilder._format_key(grouping, key), durations.keys())) + + @staticmethod + def _extract_values(durations): + return tuple(map(PlotBuilder._duration_to_seconds, durations.values())) + + def process_database(self, grouping, db_reader, date_from=None, date_to=None): + durations = grouping.enum_durations(db_reader, date_from, date_to) + + bar_chart = BarChartBuilder() + + bar_chart.set_title('How much time people spend online') + bar_chart.set_value_grid() + + bar_chart.set_integer_values_only() + bar_chart.set_property(bar_chart.get_value_labels(), + fontsize='small', rotation=30) + bar_chart.set_value_label_formatter(self._format_duration) + + labels = 
self._extract_labels(grouping, durations) + durations = self._extract_values(durations) + + if not labels or not max(durations): + bar_chart.set_value_axis_limits(0) + + bars = bar_chart.plot_bars(labels, durations, grouping is Grouping.HOUR) + bar_chart.set_property(bars, alpha=.33) + + if self._fd is sys.stdout: + bar_chart.show() + else: + bar_chart.save(self._fd) + +class OutputFormat(Enum): + CSV = 'csv' + JSON = 'json' + IMG = 'img' + + def create_writer(self, fd): + if self is OutputFormat.CSV: + return OutputWriterCSV(fd) + elif self is OutputFormat.JSON: + return OutputWriterJSON(fd) + elif self is OutputFormat.IMG: + return PlotBuilder(fd) + else: + raise NotImplementedError('unsupported output format: ' + str(self)) + + def __str__(self): + return self.value + +if __name__ == '__main__': + import argparse + + parser = argparse.ArgumentParser( + description='View/visualize the amount of time people spend online.') + + def grouping(s): + try: + return Grouping(s) + except ValueError: + raise argparse.ArgumentError() + def database_format(s): + try: + return DatabaseFormat(s) + except ValueError: + raise argparse.ArgumentError() + def output_format(s): + try: + return OutputFormat(s) + except ValueError: + raise argparse.ArgumentError() + def date_range_limit(s): + try: + return datetime.strptime(s, '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=timezone.utc) + except ValueError: + raise argparse.ArgumentError() + + parser.add_argument('input', type=argparse.FileType('r'), + help='database path') + parser.add_argument('output', type=argparse.FileType('w'), + nargs='?', default=sys.stdout, + help='output path (standard output by default)') + parser.add_argument('--grouping', type=grouping, + choices=tuple(grouping for grouping in Grouping), + default=Grouping.USER, + help='set grouping') + parser.add_argument('--input-format', type=database_format, + choices=tuple(fmt for fmt in DatabaseFormat), + default=DatabaseFormat.CSV, + help='specify database format') + 
parser.add_argument('--output-format', type=output_format, + choices=tuple(fmt for fmt in OutputFormat), + default=OutputFormat.CSV, + help='specify output format') + parser.add_argument('--from', type=date_range_limit, default=None, + dest='date_from', + help='set the date to process database records from') + parser.add_argument('--to', type=date_range_limit, default=None, + dest='date_to', + help='set the date to process database record to') + + args = parser.parse_args() + + if args.date_from is not None and args.date_to is not None: + if args.date_from > args.date_to: + args.date_from, args.date_to = args.date_to, args.date_from + + with args.input_format.create_reader(args.input) as db_reader: + output_writer = args.output_format.create_writer(args.output) + output_writer.process_database( + args.grouping, db_reader, date_from=args.date_from, + date_to=args.date_to) diff --git a/bin/track_status.py b/bin/track_status.py new file mode 100644 index 0000000..f18d908 --- /dev/null +++ b/bin/track_status.py @@ -0,0 +1,58 @@ +# Copyright 2015 Egor Tensin +# This file is licensed under the terms of the MIT License. +# See LICENSE.txt for details. 
+ +from vk.api import API, Language +from vk.tracking import StatusTracker +from vk.tracking.db import Format + +if __name__ == '__main__': + import argparse, sys + + def natural_number(s): + x = int(s) + if x < 1: + raise argparse.ArgumentError() + return x + def output_format(s): + try: + return Format(s) + except ValueError: + raise argparse.ArgumentError() + + parser = argparse.ArgumentParser( + description='Track when people go online/offline.') + + parser.add_argument(metavar='UID', dest='uids', nargs='+', + help='user IDs or "screen names"') + parser.add_argument('-t', '--timeout', type=natural_number, + default=StatusTracker.DEFAULT_TIMEOUT, + help='set refresh interval (seconds)') + parser.add_argument('-l', '--log', default=sys.stdout, + type=argparse.FileType('w'), + help='set log file path (standard output by default)') + parser.add_argument('--output-format', + type=output_format, default=Format.CSV, + choices=tuple(fmt for fmt in Format), + help='set database format') + parser.add_argument('-o', '--output', default=None, + type=argparse.FileType('w'), + help='set database file path') + + args = parser.parse_args() + + api = API(Language.EN) + tracker = StatusTracker(api, args.timeout) + + if args.output_format is Format.LOG or args.output is None: + args.output_format = Format.NULL + + with Format.LOG.create_writer(args.log) as log_writer: + tracker.add_database_writer(log_writer) + with args.output_format.create_writer(args.output) as db_writer: + tracker.add_database_writer(db_writer) + try: + tracker.loop(args.uids) + except Exception as e: + log_writer.exception(e) + sys.exit(1) diff --git a/doc/images/online_duration/date.png b/doc/images/online_duration/date.png new file mode 100644 index 0000000..477c530 Binary files /dev/null and b/doc/images/online_duration/date.png differ diff --git a/doc/images/online_duration/hour.png b/doc/images/online_duration/hour.png new file mode 100644 index 0000000..cad26b3 Binary files /dev/null and 
b/doc/images/online_duration/hour.png differ diff --git a/doc/images/online_duration/user.png b/doc/images/online_duration/user.png new file mode 100644 index 0000000..13a7420 Binary files /dev/null and b/doc/images/online_duration/user.png differ diff --git a/doc/images/online_duration/weekday.png b/doc/images/online_duration/weekday.png new file mode 100644 index 0000000..5cf659e Binary files /dev/null and b/doc/images/online_duration/weekday.png differ diff --git a/doc/mutual_friends.md b/doc/mutual_friends.md new file mode 100644 index 0000000..cc9396e --- /dev/null +++ b/doc/mutual_friends.md @@ -0,0 +1,51 @@ +mutual_friends.py +================= + +Learn who your ex and her new boyfriend are both friends with. + +Usage +----- + +Run from the top-level directory using `python -m`. +For example: + + > python -m bin.mutual_friends -h + usage: mutual_friends.py [-h] [--output-format {csv,json}] UID [UID ...] + ... + +For example (using made up user IDs/"screen names"), + + > python -m bin.mutual_friends john.doe jane.doe + 89497105,John,Smith,john.smith + 3698577,Jane,Smith,jane.smith + +In the example above, both "John Doe" and "Jane Doe" are friends with "John +Smith" and "Jane Smith", whose user IDs are 89497105 and 3698577 respectively. +Their "screen names" (the part after "vk.com/" of their personal page URLs) are +"john.smith" and "jane.smith". + +The output format is CSV (comma-separated values) by default. 
+You can also get a JSON document: + + > python -m bin.mutual_friends --output-format json john.doe jane.doe + [ + { + "uid": 89497105, + "first_name": "John", + "last_name": "Smith", + "screen_name": "john.smith" + }, + { + "uid": 3698577, + "first_name": "Jane", + "last_name": "Smith", + "screen_name": "jane.smith" + } + ] + +See also +-------- + +* [License] + +[License]: ../README.md#license diff --git a/doc/online_duration.md b/doc/online_duration.md new file mode 100644 index 0000000..22af4b1 --- /dev/null +++ b/doc/online_duration.md @@ -0,0 +1,161 @@ +online_duration.py +================== + +View/visualize the amount of time people spend online. + +Usage +----- + +Run from the top-level directory using `python -m`. +For example: + + > python -m bin.online_duration -h + usage: online_duration.py [-h] [--grouping {user,date,weekday,hour}] + [--input-format {csv,log,null}] + [--output-format {csv,json,img}] [--from DATE_FROM] + [--to DATE_TO] + input [output] + +This script additionally requires [matplotlib] to be installed. + +Analyze the database produced by [track_status.py] and calculate the total +amount of time people spent online. + +For example (assuming the database in "db.csv" was generated by +[track_status.py] before): + + > python -m bin.online_duration db.csv + 89497105,John,Smith,john.smith,0:12:31 + 3698577,Jane,Smith,jane.smith,1:34:46 + +In the example above, "John Smith" and "Jane Smith" spent approx. 13 and 95 +minutes online respectively. + +The output format is CSV (comma-separated values) by default. +You can also get a JSON document: + + > python -m bin.online_duration --output-format json db.csv + [ + { + "uid": 89497105, + "first_name": "John", + "last_name": "Smith", + "screen_name": "john.smith", + "duration": "0:12:31" + }, + { + "uid": 3698577, + "first_name": "Jane", + "last_name": "Smith", + "screen_name": "jane.smith", + "duration": "1:34:46" + } + ] + +The durations are calculated on a per-user basis by default. 
+You can change that by supplying either `date` (to group by dates), `weekday` +(to group by weekdays) or `hour` (to group by day hours) as the `--grouping` +parameter value. +For example (assuming that both John and Jane spent their time online on Friday, +June 17, 2016): + +``` +> python -m bin.online_duration --output-format json --grouping date db.csv +[ + { + "date": "2016-06-17", + "duration": "1:47:17" + } +] +``` + +``` +> python -m bin.online_duration --output-format csv --grouping weekday db.csv +Monday,0:00:00 +Tuesday,0:00:00 +Wednesday,0:00:00 +Thursday,0:00:00 +Friday,1:47:17 +Saturday,0:00:00 +Sunday,0:00:00 +``` + +``` +> python -m bin.online_duration --grouping hour db.csv +0:00:00,0:00:00 +1:00:00,0:00:00 +2:00:00,0:00:00 +3:00:00,0:00:00 +4:00:00,0:03:56 +5:00:00,0:14:14 +6:00:00,0:29:30 +7:00:00,0:31:20 +8:00:00,0:12:04 +9:00:00,0:00:00 +10:00:00,0:00:00 +11:00:00,0:23:14 +12:00:00,0:06:00 +13:00:00,0:46:19 +14:00:00,0:00:00 +15:00:00,0:00:00 +16:00:00,0:00:00 +17:00:00,0:00:00 +18:00:00,0:00:00 +19:00:00,0:00:00 +20:00:00,0:00:00 +21:00:00,0:00:00 +22:00:00,0:00:00 +23:00:00,0:00:00 +``` + +In my opinion, the script's most useful feature is the ability to easily create +plots that represent the text data (like in the examples above). +To produce a plot, pass `img` as the `--output-format` parameter value and add +a file path to write the image to. + + > python -m bin.online_duration --output-format img db.csv user.png + +![user.png] + + > python -m bin.online_duration --output-format img --grouping date db.csv date.png + +![date.png] + + > python -m bin.online_duration --output-format img --grouping weekday db.csv weekday.png + +![weekday.png] + + > python -m bin.online_duration --output-format img --grouping hour db.csv hour.png + +![hour.png] + +You can limit the scope of the database by supplying a time range. +Only online durations that are within the supplied range shall then be +processed. 
+Set the range by specifying both or one of the `--from` and `--to` parameters. +Values must be in the `%Y-%m-%dT%H:%M:%SZ` format (a subset of ISO 8601). + +All dates and times are in UTC. + +[matplotlib]: http://matplotlib.org/ +[track_status.py]: track_status.md + +[user.png]: images/online_duration/user.png +[date.png]: images/online_duration/date.png +[weekday.png]: images/online_duration/weekday.png +[hour.png]: images/online_duration/hour.png + +Known issues +------------ + +* When people go online using the web version and don't visit other pages over +time (for example, just listening to music), they appear offline. +Hence the 0:00:00 durations you might sometimes encounter. +This might also happen using other clients. + +See also +-------- + +* [License] + +[License]: ../README.md#license diff --git a/doc/track_status.md b/doc/track_status.md new file mode 100644 index 0000000..8bbb380 --- /dev/null +++ b/doc/track_status.md @@ -0,0 +1,42 @@ +track_status.py +=============== + +Track when people go online/offline. + +Usage +----- + +Run from the top-level directory using `python -m`. +For example: + + > python -m bin.track_status -h + usage: track_status.py [-h] [-t TIMEOUT] [-l LOG] + [--output-format {csv,log,null}] [-o OUTPUT] + UID [UID ...] + ... + +For example (using made up user IDs/"screen names"), + + > python -m bin.track_status john.doe jane.smith + [2016-06-18 01:43:34] John Doe is ONLINE. + [2016-06-18 01:43:34] John Doe was last seen at 2016-06-18 01:33:58+03:00 using the official iPhone app. + [2016-06-18 01:43:34] Jane Smith is OFFLINE. + [2016-06-18 01:43:34] Jane Smith was last seen at 2016-06-18 01:15:47+03:00 using the web version (or an unrecognized app). + [2016-06-18 01:59:09] Jane Smith went ONLINE. + [2016-06-18 01:59:09] Jane Smith was last seen at 2016-06-18 01:59:07+03:00 using the official Android app. + [2016-06-18 02:10:00] John Doe went OFFLINE. + [2016-06-18 02:10:00] John Doe was last seen at 2016-06-18 01:54:58+03:00 using the official iPhone app. + ... 
+ +By default, the script produces a human-readable log. +Use the `--log` parameter to write the log to a file. +If you want to record when people go online/offline for [further analysis](online_duration.md), +specify the path to a database using the `--output` parameter. +Be careful: if the file already exists, it will be overwritten! + +See also +-------- + +* [License] + +[License]: ../README.md#license diff --git a/img/online_duration/date.png b/img/online_duration/date.png deleted file mode 100644 index 477c530..0000000 Binary files a/img/online_duration/date.png and /dev/null differ diff --git a/img/online_duration/hour.png b/img/online_duration/hour.png deleted file mode 100644 index cad26b3..0000000 Binary files a/img/online_duration/hour.png and /dev/null differ diff --git a/img/online_duration/user.png b/img/online_duration/user.png deleted file mode 100644 index 13a7420..0000000 Binary files a/img/online_duration/user.png and /dev/null differ diff --git a/img/online_duration/weekday.png b/img/online_duration/weekday.png deleted file mode 100644 index 5cf659e..0000000 Binary files a/img/online_duration/weekday.png and /dev/null differ diff --git a/mutual_friends.py b/mutual_friends.py deleted file mode 100644 index 8823619..0000000 --- a/mutual_friends.py +++ /dev/null @@ -1,75 +0,0 @@ -# Copyright 2015 Egor Tensin -# This file is licensed under the terms of the MIT License. -# See LICENSE.txt for details. 
- -from collections import OrderedDict -import csv -from enum import Enum -import json -import sys - -from vk.api import API, Language -from vk.user import UserField - -OUTPUT_FIELDS = UserField.UID, UserField.FIRST_NAME, UserField.LAST_NAME, UserField.SCREEN_NAME - -def query_friend_list(api, user): - return api.friends_get(user.get_uid(), fields=OUTPUT_FIELDS) - -def extract_output_fields(user): - new_user = OrderedDict() - for field in OUTPUT_FIELDS: - new_user[str(field)] = user[field] if field in user else None - return new_user - -def print_mutual_friends_csv(mutual_friends): - writer = csv.writer(sys.stdout, lineterminator='\n') - for user in mutual_friends: - user = extract_output_fields(user) - writer.writerow(user.values()) - -def print_mutual_friends_json(mutual_friends): - print(json.dumps([extract_output_fields(user) for user in mutual_friends], indent=3)) - -def print_mutual_friends(mutual_friends, fmt): - if fmt is OutputFormat.CSV: - print_mutual_friends_csv(mutual_friends) - elif fmt is OutputFormat.JSON: - print_mutual_friends_json(mutual_friends) - else: - raise NotImplementedError('unsupported output format: ' + str(fmt)) - -class OutputFormat(Enum): - CSV = 'csv' - JSON = 'json' - - def __str__(self): - return self.value - -if __name__ == '__main__': - import argparse - - def output_format(s): - try: - return OutputFormat(s) - except ValueError: - raise argparse.ArgumentError() - - parser = argparse.ArgumentParser( - description='Learn who your ex and her new boyfriend are both friends with.') - - parser.add_argument(metavar='UID', dest='uids', nargs='+', - help='user IDs or "screen names"') - parser.add_argument('--output-format', type=output_format, - choices=tuple(fmt for fmt in OutputFormat), - default=OutputFormat.CSV, - help='specify output format') - - args = parser.parse_args() - - api = API(Language.EN) - users = api.users_get(args.uids) - - friend_lists = map(lambda user: frozenset(query_friend_list(api, user)), users) - 
mutual_friends = frozenset.intersection(*friend_lists) - print_mutual_friends(mutual_friends, args.output_format) diff --git a/online_duration.py b/online_duration.py deleted file mode 100644 index d2d9e34..0000000 --- a/online_duration.py +++ /dev/null @@ -1,370 +0,0 @@ -# Copyright 2016 Egor Tensin -# This file is licensed under the terms of the MIT License. -# See LICENSE.txt for details. - -import csv -from collections import OrderedDict -from datetime import datetime, timedelta, timezone -from enum import Enum -import json -import sys - -import matplotlib.pyplot as plt -import numpy as np - -from vk.tracking import OnlineStreakEnumerator, Weekday -from vk.tracking.db import Format as DatabaseFormat -from vk.user import UserField - -class Grouping(Enum): - USER = 'user' - DATE = 'date' - WEEKDAY = 'weekday' - HOUR = 'hour' - - def enum_durations(self, db_reader, date_from=None, date_to=None): - if self is Grouping.USER: - return OnlineStreakEnumerator(date_from, date_to).group_by_user(db_reader) - elif self is Grouping.DATE: - return OnlineStreakEnumerator(date_from, date_to).group_by_date(db_reader) - elif self is Grouping.WEEKDAY: - return OnlineStreakEnumerator(date_from, date_to).group_by_weekday(db_reader) - elif self is Grouping.HOUR: - return OnlineStreakEnumerator(date_from, date_to).group_by_hour(db_reader) - else: - raise NotImplementedError('unsupported grouping: ' + str(self)) - - def __str__(self): - return self.value - -_USER_FIELDS = ( - UserField.UID, - UserField.FIRST_NAME, - UserField.LAST_NAME, - UserField.SCREEN_NAME, -) - -class OutputWriterCSV: - def __init__(self, fd=sys.stdout): - self._writer = csv.writer(fd, lineterminator='\n') - - def _user_to_row(user): - return [user[field] for field in _USER_FIELDS] - - def _date_to_row(date): - return [str(date)] - - def _weekday_to_row(weekday): - return [str(weekday)] - - def _hour_to_row(hour): - return [str(timedelta(hours=hour))] - - _CONVERT_KEY_TO_ROW = { - Grouping.USER: _user_to_row, - 
Grouping.DATE: _date_to_row, - Grouping.WEEKDAY: _weekday_to_row, - Grouping.HOUR: _hour_to_row, - } - - @staticmethod - def _key_to_row(grouping, key): - if grouping not in OutputWriterCSV._CONVERT_KEY_TO_ROW: - raise NotImplementedError('unsupported grouping: ' + str(grouping)) - return OutputWriterCSV._CONVERT_KEY_TO_ROW[grouping](key) - - def process_database(self, grouping, db_reader, date_from=None, date_to=None): - for key, duration in grouping.enum_durations(db_reader, date_from, date_to).items(): - row = self._key_to_row(grouping, key) - row.append(str(duration)) - self._write_row(row) - - def _write_row(self, row): - self._writer.writerow(row) - -_DATE_FIELD = 'date' -_WEEKDAY_FIELD = 'weekday' -_HOUR_FIELD = 'hour' - -class OutputWriterJSON: - def __init__(self, fd=sys.stdout): - self._fd = fd - - def _user_to_object(user): - obj = OrderedDict() - for field in _USER_FIELDS: - obj[str(field)] = user[field] - return obj - - def _date_to_object(date): - obj = OrderedDict() - obj[_DATE_FIELD] = str(date) - return obj - - def _weekday_to_object(weekday): - obj = OrderedDict() - obj[_WEEKDAY_FIELD] = str(weekday) - return obj - - def _hour_to_object(hour): - obj = OrderedDict() - obj[_HOUR_FIELD] = str(timedelta(hours=hour)) - return obj - - _DURATION_FIELD = 'duration' - - _CONVERT_KEY_TO_OBJECT = { - Grouping.USER: _user_to_object, - Grouping.DATE: _date_to_object, - Grouping.WEEKDAY: _weekday_to_object, - Grouping.HOUR: _hour_to_object, - } - - @staticmethod - def _key_to_object(grouping, key): - if not grouping in OutputWriterJSON._CONVERT_KEY_TO_OBJECT: - raise NotImplementedError('unsupported grouping: ' + str(grouping)) - return OutputWriterJSON._CONVERT_KEY_TO_OBJECT[grouping](key) - - def process_database(self, grouping, db_reader, date_from=None, date_to=None): - arr = [] - for key, duration in grouping.enum_durations(db_reader, date_from, date_to).items(): - obj = self._key_to_object(grouping, key) - obj[self._DURATION_FIELD] = str(duration) - 
arr.append(obj) - self._fd.write(json.dumps(arr, indent=3)) - -class BarChartBuilder: - _BAR_HEIGHT = 1. - - def __init__(self): - self._fig, self._ax = plt.subplots() - - def set_title(self, title): - self._ax.set_title(title) - - def _get_bar_axis(self): - return self._ax.get_yaxis() - - def _get_value_axis(self): - return self._ax.get_xaxis() - - def set_bar_axis_limits(self, start=None, end=None): - self._ax.set_ylim(bottom=start, top=end) - - def set_value_axis_limits(self, start=None, end=None): - self._ax.set_xlim(left=start, right=end) - - def set_value_grid(self): - self._get_value_axis().grid() - - def get_bar_labels(self): - return self._get_bar_axis().get_ticklabels() - - def get_value_labels(self): - return self._get_value_axis().get_ticklabels() - - def set_value_label_formatter(self, fn): - from matplotlib.ticker import FuncFormatter - self._get_value_axis().set_major_formatter(FuncFormatter(fn)) - - def set_integer_values_only(self): - from matplotlib.ticker import MaxNLocator - self._get_value_axis().set_major_locator(MaxNLocator(integer=True)) - - def set_property(self, *args, **kwargs): - plt.setp(*args, **kwargs) - - def _set_size(self, inches, dim=0): - fig_size = self._fig.get_size_inches() - assert len(fig_size) == 2 - fig_size[dim] = inches - self._fig.set_size_inches(fig_size, forward=True) - - def set_width(self, inches): - self._set_size(inches) - - def set_height(self, inches): - self._set_size(inches, dim=1) - - def plot_bars(self, bar_labels, values, datetime_ticks=False): - numof_bars = len(bar_labels) - - if not numof_bars: - self.set_height(1) - self._get_bar_axis().set_tick_params(labelleft=False) - return [] - - self.set_height(numof_bars / 2 if datetime_ticks else numof_bars) - - bar_offsets = np.arange(numof_bars) * 2 * self._BAR_HEIGHT + self._BAR_HEIGHT - bar_axis_min, bar_axis_max = 0, 2 * self._BAR_HEIGHT * numof_bars - - if datetime_ticks: - self._get_bar_axis().set_ticks(bar_offsets - self._BAR_HEIGHT) - else: - 
self._get_bar_axis().set_ticks(bar_offsets) - - self._get_bar_axis().set_ticklabels(bar_labels) - self.set_bar_axis_limits(bar_axis_min, bar_axis_max) - - return self._ax.barh(bar_offsets, values, align='center', height=self._BAR_HEIGHT) - - def show(self): - plt.show() - - def save(self, path): - self._fig.savefig(path, bbox_inches='tight') - -class PlotBuilder: - def __init__(self, fd=sys.stdout): - self._fd = fd - - def _format_user(user): - return '{}\n{}'.format(user.get_first_name(), user.get_last_name()) - - def _format_date(date): - return str(date) - - def _format_weekday(weekday): - return str(weekday) - - def _format_hour(hour): - return '{}:00'.format(hour) - - _FORMAT_KEY = { - Grouping.USER: _format_user, - Grouping.DATE: _format_date, - Grouping.WEEKDAY: _format_weekday, - Grouping.HOUR: _format_hour, - } - - @staticmethod - def _format_key(grouping, key): - if grouping not in PlotBuilder._FORMAT_KEY: - raise NotImplementedError('unsupported grouping: ' + str(grouping)) - return PlotBuilder._FORMAT_KEY[grouping](key) - - @staticmethod - def _format_duration(seconds, _): - return str(timedelta(seconds=seconds)) - - @staticmethod - def _duration_to_seconds(td): - return td.total_seconds() - - @staticmethod - def _extract_labels(grouping, durations): - return tuple(map(lambda key: PlotBuilder._format_key(grouping, key), durations.keys())) - - @staticmethod - def _extract_values(durations): - return tuple(map(PlotBuilder._duration_to_seconds, durations.values())) - - def process_database(self, grouping, db_reader, date_from=None, date_to=None): - durations = grouping.enum_durations(db_reader, date_from, date_to) - - bar_chart = BarChartBuilder() - - bar_chart.set_title('How much time people spend online') - bar_chart.set_value_grid() - - bar_chart.set_integer_values_only() - bar_chart.set_property(bar_chart.get_value_labels(), - fontsize='small', rotation=30) - bar_chart.set_value_label_formatter(self._format_duration) - - labels = 
self._extract_labels(grouping, durations) - durations = self._extract_values(durations) - - if not labels or not max(durations): - bar_chart.set_value_axis_limits(0) - - bars = bar_chart.plot_bars(labels, durations, grouping is Grouping.HOUR) - bar_chart.set_property(bars, alpha=.33) - - if self._fd is sys.stdout: - bar_chart.show() - else: - bar_chart.save(self._fd) - -class OutputFormat(Enum): - CSV = 'csv' - JSON = 'json' - IMG = 'img' - - def create_writer(self, fd): - if self is OutputFormat.CSV: - return OutputWriterCSV(fd) - elif self is OutputFormat.JSON: - return OutputWriterJSON(fd) - elif self is OutputFormat.IMG: - return PlotBuilder(fd) - else: - raise NotImplementedError('unsupported output format: ' + str(self)) - - def __str__(self): - return self.value - -if __name__ == '__main__': - import argparse - - parser = argparse.ArgumentParser( - description='View the amount of time people spent online.') - - def grouping(s): - try: - return Grouping(s) - except ValueError: - raise argparse.ArgumentError() - def database_format(s): - try: - return DatabaseFormat(s) - except ValueError: - raise argparse.ArgumentError() - def output_format(s): - try: - return OutputFormat(s) - except ValueError: - raise argparse.ArgumentError() - def date_range_limit(s): - try: - return datetime.strptime(s, '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=timezone.utc) - except ValueError: - raise argparse.ArgumentError() - - parser.add_argument('input', type=argparse.FileType('r'), - help='database path') - parser.add_argument('output', type=argparse.FileType('w'), - nargs='?', default=sys.stdout, - help='output path (standard output by default)') - parser.add_argument('--grouping', type=grouping, - choices=tuple(grouping for grouping in Grouping), - default=Grouping.USER, - help='set grouping') - parser.add_argument('--input-format', type=database_format, - choices=tuple(fmt for fmt in DatabaseFormat), - default=DatabaseFormat.CSV, - help='specify database format') - 
parser.add_argument('--output-format', type=output_format, - choices=tuple(fmt for fmt in OutputFormat), - default=OutputFormat.CSV, - help='specify output format') - parser.add_argument('--from', type=date_range_limit, default=None, - dest='date_from', - help='set the date to process database records from') - parser.add_argument('--to', type=date_range_limit, default=None, - dest='date_to', - help='set the date to process database record to') - - args = parser.parse_args() - - if args.date_from is not None and args.date_to is not None: - if args.date_from > args.date_to: - args.date_from, args.date_to = args.date_to, args.date_from - - with args.input_format.create_reader(args.input) as db_reader: - output_writer = args.output_format.create_writer(args.output) - output_writer.process_database( - args.grouping, db_reader, date_from=args.date_from, - date_to=args.date_to) diff --git a/track_status.py b/track_status.py deleted file mode 100644 index f18d908..0000000 --- a/track_status.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright 2015 Egor Tensin -# This file is licensed under the terms of the MIT License. -# See LICENSE.txt for details. 
- -from vk.api import API, Language -from vk.tracking import StatusTracker -from vk.tracking.db import Format - -if __name__ == '__main__': - import argparse, sys - - def natural_number(s): - x = int(s) - if x < 1: - raise argparse.ArgumentError() - return x - def output_format(s): - try: - return Format(s) - except ValueError: - raise argparse.ArgumentError() - - parser = argparse.ArgumentParser( - description='Track when people go online/offline.') - - parser.add_argument(metavar='UID', dest='uids', nargs='+', - help='user IDs or "screen names"') - parser.add_argument('-t', '--timeout', type=natural_number, - default=StatusTracker.DEFAULT_TIMEOUT, - help='set refresh interval (seconds)') - parser.add_argument('-l', '--log', default=sys.stdout, - type=argparse.FileType('w'), - help='set log file path (standard output by default)') - parser.add_argument('--output-format', - type=output_format, default=Format.CSV, - choices=tuple(fmt for fmt in Format), - help='set database format') - parser.add_argument('-o', '--output', default=None, - type=argparse.FileType('w'), - help='set database file path') - - args = parser.parse_args() - - api = API(Language.EN) - tracker = StatusTracker(api, args.timeout) - - if args.output_format is Format.LOG or args.output is None: - args.output_format = Format.NULL - - with Format.LOG.create_writer(args.log) as log_writer: - tracker.add_database_writer(log_writer) - with args.output_format.create_writer(args.output) as db_writer: - tracker.add_database_writer(db_writer) - try: - tracker.loop(args.uids) - except Exception as e: - log_writer.exception(e) - sys.exit(1) -- cgit v1.2.3