aboutsummaryrefslogtreecommitdiffstatshomepage
diff options
context:
space:
mode:
authorEgor Tensin <Egor.Tensin@gmail.com>2016-06-18 17:13:44 +0300
committerEgor Tensin <Egor.Tensin@gmail.com>2016-06-18 17:13:44 +0300
commit768af3bea05346d3b412351f70a95454535a8955 (patch)
tree1eb40f573311df2c2ee7a319cfc41711816a50bb
parentREADME update (diff)
downloadvk-scripts-768af3bea05346d3b412351f70a95454535a8955.tar.gz
vk-scripts-768af3bea05346d3b412351f70a95454535a8955.zip
implement grouping by hour of the day
-rw-r--r--README.md46
-rw-r--r--img/online_duration/hour.pngbin0 -> 53227 bytes
-rw-r--r--online_duration.py71
-rw-r--r--vk/utils/tracking/online_streaks.py52
4 files changed, 145 insertions, 24 deletions
diff --git a/README.md b/README.md
index 50202de..06f137e 100644
--- a/README.md
+++ b/README.md
@@ -84,8 +84,9 @@ You can also get a JSON document:
]
The durations are calculated on a per-user basis by default.
-You can change that by supplying either `date` (to group by dates) or `weekday`
-(to group by weekdays) as the `--grouping` parameter value.
+You can change that by supplying either `date` (to group by dates), `weekday`
+(to group by weekdays) or `hour` (to group by hour of the day) as the
+`--grouping` parameter value.
For example (assuming that both Jane and Joe spent their time online on Friday,
June 17, 2016).
@@ -110,6 +111,34 @@ Saturday,0:00:00
Sunday,0:00:00
```
+```
+> online_duration.py --grouping hour db.csv
+0:00:00,0:00:00
+1:00:00,0:00:00
+2:00:00,0:00:00
+3:00:00,0:00:00
+4:00:00,0:03:56
+5:00:00,0:14:14
+6:00:00,0:29:30
+7:00:00,0:31:20
+8:00:00,0:12:04
+9:00:00,0:00:00
+10:00:00,0:00:00
+11:00:00,0:23:14
+12:00:00,0:06:00
+13:00:00,0:46:19
+14:00:00,0:00:00
+15:00:00,0:00:00
+16:00:00,0:00:00
+17:00:00,0:00:00
+18:00:00,0:00:00
+19:00:00,0:00:00
+20:00:00,0:00:00
+21:00:00,0:00:00
+22:00:00,0:00:00
+23:00:00,0:00:00
+```
+
In my opinion, the script's most useful feature is the ability to easily create
plots that represent the text data (like in the examples above).
To produce a plot, pass `img` as the `--output-format` parameter value and add
@@ -127,12 +156,25 @@ a file path to write the image to.
![weekday.png]
+ > online_duration.py --output-format img --grouping hour db.csv hour.png
+
+![hour.png]
+
+You can limit the scope of the database by supplying a time range.
+Only online durations that are within the supplied range shall then be
+processed.
+Set the range by specifying either or both of the `--from` and `--to` parameters.
+Values must be in the `%Y-%m-%dT%H:%M:%SZ` format (a subset of ISO 8601).
+
+All dates and times are in UTC.
+
[matplotlib]: http://matplotlib.org/
[track_status.py]: #track_statuspy
[user.png]: img/online_duration/user.png
[date.png]: img/online_duration/date.png
[weekday.png]: img/online_duration/weekday.png
+[hour.png]: img/online_duration/hour.png
### mutual_friends.py
diff --git a/img/online_duration/hour.png b/img/online_duration/hour.png
new file mode 100644
index 0000000..cad26b3
--- /dev/null
+++ b/img/online_duration/hour.png
Binary files differ
diff --git a/online_duration.py b/online_duration.py
index 637b0ec..68f716a 100644
--- a/online_duration.py
+++ b/online_duration.py
@@ -4,7 +4,7 @@
import csv
from collections import OrderedDict
-from datetime import timedelta
+from datetime import datetime, timedelta, timezone
from enum import Enum
import json
import sys
@@ -20,14 +20,17 @@ class Grouping(Enum):
USER = 'user'
DATE = 'date'
WEEKDAY = 'weekday'
+ HOUR = 'hour'
- def enum_durations(self, db_reader):
+ def enum_durations(self, db_reader, date_from=None, date_to=None):
if self is Grouping.USER:
- return OnlineStreakEnumerator().group_by_user(db_reader)
+ return OnlineStreakEnumerator(date_from, date_to).group_by_user(db_reader)
elif self is Grouping.DATE:
- return OnlineStreakEnumerator().group_by_date(db_reader)
+ return OnlineStreakEnumerator(date_from, date_to).group_by_date(db_reader)
elif self is Grouping.WEEKDAY:
- return OnlineStreakEnumerator().group_by_weekday(db_reader)
+ return OnlineStreakEnumerator(date_from, date_to).group_by_weekday(db_reader)
+ elif self is Grouping.HOUR:
+ return OnlineStreakEnumerator(date_from, date_to).group_by_hour(db_reader)
else:
raise NotImplementedError('unsupported grouping: ' + str(self))
@@ -54,10 +57,14 @@ class OutputWriterCSV:
def _weekday_to_row(weekday):
return [str(weekday)]
+    def _hour_to_row(hour):
+        """Return a one-cell CSV row holding the hour as an offset from midnight.
+
+        The hour number is rendered via `timedelta`, e.g. 4 -> '4:00:00'.
+        """
+        hour_offset = timedelta(hours=hour)
+        return [str(hour_offset)]
+
_CONVERT_KEY_TO_ROW = {
Grouping.USER: _user_to_row,
Grouping.DATE: _date_to_row,
Grouping.WEEKDAY: _weekday_to_row,
+ Grouping.HOUR: _hour_to_row,
}
@staticmethod
@@ -66,8 +73,8 @@ class OutputWriterCSV:
raise NotImplementedError('unsupported grouping: ' + str(grouping))
return OutputWriterCSV._CONVERT_KEY_TO_ROW[grouping](key)
- def process_database(self, grouping, db_reader):
- for key, duration in grouping.enum_durations(db_reader).items():
+ def process_database(self, grouping, db_reader, date_from=None, date_to=None):
+ for key, duration in grouping.enum_durations(db_reader, date_from, date_to).items():
row = self._key_to_row(grouping, key)
row.append(str(duration))
self._write_row(row)
@@ -77,6 +84,7 @@ class OutputWriterCSV:
_DATE_FIELD = 'date'
_WEEKDAY_FIELD = 'weekday'
+_HOUR_FIELD = 'hour'
class OutputWriterJSON:
def __init__(self, fd=sys.stdout):
@@ -98,12 +106,18 @@ class OutputWriterJSON:
obj[_WEEKDAY_FIELD] = str(weekday)
return obj
+    def _hour_to_object(hour):
+        """Return an ordered JSON object whose only field is the hour.
+
+        The hour number is rendered via `timedelta`, e.g. 4 -> '4:00:00',
+        and stored under the `_HOUR_FIELD` key.
+        """
+        return OrderedDict([(_HOUR_FIELD, str(timedelta(hours=hour)))])
+
_DURATION_FIELD = 'duration'
_CONVERT_KEY_TO_OBJECT = {
Grouping.USER: _user_to_object,
Grouping.DATE: _date_to_object,
Grouping.WEEKDAY: _weekday_to_object,
+ Grouping.HOUR: _hour_to_object,
}
@staticmethod
@@ -112,9 +126,9 @@ class OutputWriterJSON:
raise NotImplementedError('unsupported grouping: ' + str(grouping))
return OutputWriterJSON._CONVERT_KEY_TO_OBJECT[grouping](key)
- def process_database(self, grouping, db_reader):
+ def process_database(self, grouping, db_reader, date_from=None, date_to=None):
arr = []
- for key, duration in grouping.enum_durations(db_reader).items():
+ for key, duration in grouping.enum_durations(db_reader, date_from, date_to).items():
obj = self._key_to_object(grouping, key)
obj[self._DURATION_FIELD] = str(duration)
arr.append(obj)
@@ -173,7 +187,7 @@ class BarChartBuilder:
def set_height(self, inches):
self._set_size(inches, dim=1)
- def plot_bars(self, bar_labels, values):
+ def plot_bars(self, bar_labels, values, datetime_ticks=False):
numof_bars = len(bar_labels)
if not numof_bars:
@@ -181,12 +195,16 @@ class BarChartBuilder:
self._get_bar_axis().set_tick_params(labelleft=False)
return []
- self.set_height(numof_bars)
+ self.set_height(numof_bars / 2 if datetime_ticks else numof_bars)
bar_offsets = np.arange(numof_bars) * 2 * self._BAR_HEIGHT + self._BAR_HEIGHT
bar_axis_min, bar_axis_max = 0, 2 * self._BAR_HEIGHT * numof_bars
- self._get_bar_axis().set_ticks(bar_offsets)
+ if datetime_ticks:
+ self._get_bar_axis().set_ticks(bar_offsets - self._BAR_HEIGHT)
+ else:
+ self._get_bar_axis().set_ticks(bar_offsets)
+
self._get_bar_axis().set_ticklabels(bar_labels)
self.set_bar_axis_limits(bar_axis_min, bar_axis_max)
@@ -211,10 +229,14 @@ class PlotBuilder:
def _format_weekday(weekday):
return str(weekday)
+    def _format_hour(hour):
+        """Return the label used for an hour key, e.g. 13 -> '13:00'."""
+        return str(hour) + ':00'
+
_FORMAT_KEY = {
Grouping.USER: _format_user,
Grouping.DATE: _format_date,
Grouping.WEEKDAY: _format_weekday,
+ Grouping.HOUR: _format_hour,
}
@staticmethod
@@ -239,8 +261,8 @@ class PlotBuilder:
def _extract_values(durations):
return tuple(map(PlotBuilder._duration_to_seconds, durations.values()))
- def process_database(self, grouping, db_reader):
- durations = grouping.enum_durations(db_reader)
+ def process_database(self, grouping, db_reader, date_from=None, date_to=None):
+ durations = grouping.enum_durations(db_reader, date_from, date_to)
bar_chart = BarChartBuilder()
@@ -258,7 +280,7 @@ class PlotBuilder:
if not labels or not max(durations):
bar_chart.set_value_axis_limits(0)
- bars = bar_chart.plot_bars(labels, durations)
+ bars = bar_chart.plot_bars(labels, durations, grouping is Grouping.HOUR)
bar_chart.set_property(bars, alpha=.33)
if self._fd is sys.stdout:
@@ -305,6 +327,11 @@ if __name__ == '__main__':
return OutputFormat(s)
except ValueError:
raise argparse.ArgumentError()
+    def date_range_limit(s):
+        """Parse a `--from`/`--to` value into a timezone-aware UTC datetime.
+
+        The value must match the `%Y-%m-%dT%H:%M:%SZ` format.
+
+        Raises:
+            argparse.ArgumentTypeError: if the value doesn't match the format
+                (argparse turns this into a regular usage error).
+        """
+        try:
+            return datetime.strptime(s, '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=timezone.utc)
+        except ValueError:
+            # ArgumentError requires (argument, message) and is meant for
+            # argparse internals; ArgumentTypeError is the exception that
+            # `type=` callables are expected to raise.
+            raise argparse.ArgumentTypeError(
+                'invalid date/time (expected YYYY-MM-DDTHH:MM:SSZ): ' + repr(s))
parser.add_argument('input', type=argparse.FileType('r'),
help='database path')
@@ -323,9 +350,21 @@ if __name__ == '__main__':
choices=tuple(fmt for fmt in OutputFormat),
default=OutputFormat.CSV,
help='specify output format')
+ parser.add_argument('--from', type=date_range_limit, default=None,
+ dest='date_from',
+ help='set the date to process database records from')
+ parser.add_argument('--to', type=date_range_limit, default=None,
+ dest='date_to',
+ help='set the date to process database record to')
args = parser.parse_args()
+ if args.date_from is not None and args.date_to is not None:
+ if args.date_from > args.date_to:
+ args.date_from, args.date_to = args.date_to, args.date_from
+
with args.input_format.create_reader(args.input) as db_reader:
output_writer = args.output_format.create_writer(args.output)
- output_writer.process_database(args.grouping, db_reader)
+ output_writer.process_database(
+ args.grouping, db_reader, date_from=args.date_from,
+ date_to=args.date_to)
diff --git a/vk/utils/tracking/online_streaks.py b/vk/utils/tracking/online_streaks.py
index dfac0be..5c9aa48 100644
--- a/vk/utils/tracking/online_streaks.py
+++ b/vk/utils/tracking/online_streaks.py
@@ -22,8 +22,10 @@ class Weekday(Enum):
return self.name[0] + self.name[1:].lower()
class OnlineStreakEnumerator(MutableMapping):
- def __init__(self):
+ def __init__(self, date_from=None, date_to=None):
self._records = {}
+ self._date_from = date_from
+ self._date_to = date_to
def __getitem__(self, user):
return self._records[user]
@@ -40,11 +42,32 @@ class OnlineStreakEnumerator(MutableMapping):
def __len__(self):
return len(self._records)
+    def _cut_period(self, streak):
+        """Clip an online streak to the configured date range.
+
+        `streak` is a `(user, time_from, time_to)` tuple.  A bound that is
+        `None` leaves that side of the streak untouched.
+
+        Returns the (possibly shortened) streak, or `None` if the streak
+        ends before the lower bound or starts after the upper bound.
+        """
+        user, time_from, time_to = streak
+        lower, upper = self._date_from, self._date_to
+        if lower is not None:
+            if time_to < lower:
+                return None
+            time_from = max(time_from, lower)
+        if upper is not None:
+            if time_from > upper:
+                return None
+            time_to = min(time_to, upper)
+        return user, time_from, time_to
+
def enum(self, db_reader):
for record in db_reader:
- period = self._insert_record(record)
- if period is not None:
- yield period
+ streak = self._insert_record(record)
+ if streak is not None:
+ streak = self._cut_period(streak)
+ if streak is not None:
+ yield streak
def group_by_user(self, db_reader):
by_user = {}
@@ -72,6 +95,15 @@ class OnlineStreakEnumerator(MutableMapping):
by_weekday[Weekday(date.weekday())] += duration
return by_weekday
+    def group_by_hour(self, db_reader):
+        """Sum online durations per hour of the day.
+
+        Returns an `OrderedDict` mapping every hour number 0..23 (in
+        order) to a `timedelta`; hours with no online time map to a zero
+        duration.
+        """
+        by_hour = OrderedDict((hour, timedelta()) for hour in range(24))
+        for _, streak_start, streak_end in self.enum(db_reader):
+            per_hour = self._enum_hours_and_durations(streak_start, streak_end)
+            for hour, duration in per_hour:
+                by_hour[hour] += duration
+        return by_hour
+
@staticmethod
def _enum_dates_and_durations(time_from, time_to):
while time_from.date() != time_to.date():
@@ -80,6 +112,14 @@ class OnlineStreakEnumerator(MutableMapping):
time_from = next_day
yield time_to.date(), time_to - time_from
+    @staticmethod
+    def _enum_hours_and_durations(time_from, time_to):
+        """Split the period from `time_from` to `time_to` at hour boundaries.
+
+        Yields `(hour, duration)` pairs, where `hour` is the hour of the
+        day (0..23) and `duration` is the part of the period that falls
+        into that clock hour.  The last pair covers the remainder within
+        the hour of `time_to`.
+        """
+        while time_from.date() != time_to.date() or time_from.hour != time_to.hour:
+            # Zero the microseconds as well: otherwise the "boundary" keeps
+            # the sub-second part of time_from, which misattributes time
+            # between neighbouring hours and can make the final duration
+            # negative.
+            next_hour = time_from.replace(minute=0, second=0, microsecond=0) + timedelta(hours=1)
+            yield time_from.hour, next_hour - time_from
+            time_from = next_hour
+        yield time_to.hour, time_to - time_from
+
def _insert_record(self, record):
return self._insert_user(record.to_user())
@@ -95,6 +135,6 @@ class OnlineStreakEnumerator(MutableMapping):
return None
if user.is_online():
return None
- period = user, self[user].get_last_seen_time(), user.get_last_seen_time()
+ streak = user, self[user].get_last_seen_time(), user.get_last_seen_time()
self[user] = user
- return period
+ return streak