import glob
# collect every parsed daily snapshot file produced by the HN scraper
parsed = glob.glob('/home/karpathy/HNtrend/db/parsed_hn*.pgzip')
print 'total number of days of data: ', len(parsed)
# lets get a sense of the data and what these records look like
import gzippickle
# NOTE(review): gzippickle is a project-local helper (presumably gzip-wrapped pickle) — not stdlib
print parsed[0]
records = gzippickle.load(parsed[0])
# peek at a few raw post dicts from the 'main' (front) page and the 'new' page
print records[0]['main']['posts'][0]
print records[0]['new']['posts'][0]
print records[0]['main']['posts'][1]
# lets visualize activity on one day. First compute activity for one day here
records = gzippickle.load(parsed[0]) # one of the days
# this utility function will create dict of [post id] -> [(t, points), ...]
# where t is time
def record_to_activity(records):
    """Convert one day's scrape records into per-story point histories.

    Parameters
    ----------
    records : list of dict
        Each record carries a 'time' (naive datetime of the snapshot) and
        'main'/'new' page dicts, each with a 'posts' list of post dicts
        holding 'id', 'points' and 'title'.

    Returns
    -------
    (activity, titles) : tuple of dicts
        activity maps post id -> time-sorted list of (t_seconds, points);
        titles maps post id -> post title.
    """
    import time  # local import keeps this notebook cell self-contained

    activity = {}
    titles = {}
    for r in records:
        # consider posts from both the front page and the new page together
        posts = r['main']['posts'] + r['new']['posts']
        # seconds since epoch for this snapshot; time.mktime is portable,
        # unlike strftime("%s") which is a glibc-only extension
        t = int(time.mktime(r['time'].timetuple()))
        seen = set()  # set: membership tests were O(n) with the old list
        for p in posts:
            pid = p['id']
            if pid in seen:
                continue  # the same story can appear on both NEW and FRONT
            seen.add(pid)
            history = activity.setdefault(pid, [])
            pts = p['points']
            if pts > 0:  # non-positive points indicate a bad measurement
                history.append((t, pts))
            titles[pid] = p['title']
    # one more pass: sort every history so time order is guaranteed even if
    # the input records arrived out of order
    for pid in activity:
        activity[pid].sort()
    return (activity, titles)
activity, titles = record_to_activity(records)
print 'number of unique stories this day: ', len(activity)
# IPython magic: render matplotlib figures inline in the notebook
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.dates
from datetime import datetime
import numpy as np
# make inline figures large enough to read (width, height in inches)
plt.rcParams['figure.figsize'] = (20.0, 8.0)
def activity_to_plot(activity):
    """Plot raw point totals over time, one line per story.

    Parameters
    ----------
    activity : dict
        Maps post id -> list of (t_seconds, points) tuples, as produced by
        record_to_activity. Stories with empty histories are skipped.
    """
    fig = plt.figure()
    ax = fig.add_subplot(111)
    for pid, act in activity.items():
        # guard: stories whose points were never positive have empty lists,
        # and zip(*[]) would raise on the unpack below
        if not act:
            continue
        ts, ps = zip(*act)  # split (t, points) tuples into parallel sequences
        ds = [datetime.fromtimestamp(t) for t in ts]
        ax.plot(ds, ps)
    plt.title('Point values for stories on HN on one day')
    # human-readable month/day hour:minute labels on the time axis
    hfmt = matplotlib.dates.DateFormatter('%m/%d %H:%M')
    ax.xaxis.set_major_formatter(hfmt)
    plt.xticks(rotation=25)
# draw the raw point curves for every story of this day
activity_to_plot(activity)
# Okay that was boring, lets instead visualize the derivative which should be more interesting
def activity_to_plot_gradient(activity, titles, window_size=10*60, title_threshold=10):
    """Plot the rate at which each story gains points (discrete derivative).

    Parameters
    ----------
    activity : dict
        Maps post id -> list of (t_seconds, points), as from record_to_activity.
    titles : dict
        Maps post id -> story title, used to annotate the fastest risers.
    window_size : int
        Width of the regular resampling grid, in seconds (default 10 minutes).
    title_threshold : int
        Annotate a story's title when its peak rate exceeds this value.
    """
    fig = plt.figure()
    ax = fig.add_subplot(111)
    for pid, act in activity.items():
        # guard: empty histories (stories with no positive measurements)
        # would crash the tuple unpack below
        if not act:
            continue
        ts, ps = zip(*act)  # unpack the (t, points) tuples for this story
        arrt = np.array(ts)  # convert time and points to numpy arrays
        arrp = np.array(ps)
        mint = arrt.min()
        maxt = arrt.max()
        # resample onto a regular time grid: stories are observed at
        # different, irregular times and its a mess otherwise
        regt = np.arange(mint, maxt, window_size)
        if regt.size <= 1:
            continue  # cant even compute a derivative from one sample
        regp = np.interp(regt, arrt, arrp)  # 1d-interpolate onto the grid
        # derivative per grid step, i.e. points gained per window_size seconds
        regpd = np.gradient(regp)
        ax.plot(regt, regpd)
        # annotate the most successful stories of the day with their title
        if regpd.max() > title_threshold:
            maxix = np.argmax(regpd)
            ax.annotate(titles[pid],
                        xy=(regt[maxix], regpd[maxix]),
                        xytext=(regt[maxix], regpd[maxix] + 1),
                        arrowprops=dict(facecolor='black', width=1, frac=0.5))
    plt.title('Rate at which a stories get points, per %d seconds' % (window_size, ))
# derivative plot for the day loaded above
activity_to_plot_gradient(activity, titles)
# This is fun, lets do one more!
# repeat the full pipeline on a different day (parsed[3]) as a sanity check
activity2, titles2 = record_to_activity(gzippickle.load(parsed[3]))
activity_to_plot_gradient(activity2, titles2)