import glob
parsed = glob.glob('/home/karpathy/HNtrend/db/parsed_hn*.pgzip')
print 'total number of days of data: ', len(parsed)
# lets get a sense of the data and what these records look like
import gzippickle
print parsed[0]
records = gzippickle.load(parsed[0])
print records[0]['main']['posts'][0]
print records[0]['new']['posts'][0]
print records[0]['main']['posts'][1]
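# Aside: gzippickle is a tiny helper module that isn't included in this notebook; judging by the
# .pgzip extension it is presumably just gzip-compressed pickles. A minimal, hypothetical sketch of
# what its load() might look like (the real module may differ):
def _gzippickle_load_sketch(path):
    import gzip, cPickle
    with gzip.open(path, 'rb') as f:
        return cPickle.load(f) # unpickle the decompressed byte stream
# For reference, the post fields this notebook relies on later are:
# p['id'], p['title'], p['points'], p['num_comments'], p['minutes_ago'], p['domain'], p['user'],
# and each record r has r['time'], r['main']['posts'] and r['new']['posts'].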
# lets visualize activity on one day. First compute activity for one day here
records = gzippickle.load(parsed[0]) # one of the days
# this utility function will create dict of [post id] -> [(t, points), ...]
# where t is time
def record_to_activity(records):
activity = {}
titles = {}
for r in records:
m = r['main']['posts'] + r['new']['posts'] # lets put all posts together from both pages
t = int(r['time'].strftime("%s")) # seconds since epoch when this record was generated
pids_seen = []
for i,p in enumerate(m):
pid = p['id']
if pid in pids_seen: continue # there can be duplicates in NEW and FRONT
pids_seen.append(pid)
if not pid in activity: activity[pid] = []
L = activity[pid]
pts = p['points']
if pts > 0: # if its not positive we somehow had incorrect measurement
L.append((t, pts))
titles[pid] = p['title']
# make one more pass and sort all lists just to make sure time order is respected
for pid in activity: activity[pid].sort()
return (activity, titles)
activity, titles = record_to_activity(records)
print 'number of unique stories this day: ', len(activity)
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.dates
from datetime import datetime
import numpy as np
plt.rcParams['figure.figsize'] = (20.0, 8.0)
def activity_to_plot(activity):
fig = plt.figure()
ax = fig.add_subplot(111)
for pid, act in activity.items():
ts, ps = zip(*(act)) # unpack the list of tuples of form (t, points) for this story
ds = [datetime.fromtimestamp(t) for t in ts]
plt.plot(ds, ps)
plt.title('Point values for stories on HN on one day')
hfmt = matplotlib.dates.DateFormatter('%m/%d %H:%M')
ax.xaxis.set_major_formatter(hfmt)
plt.xticks(rotation=25)
activity_to_plot(activity)
# Okay that was boring, lets instead visualize the derivative which should be more interesting
def activity_to_plot_gradient(activity, titles, window_size = 10*60, title_threshold = 10):
fig = plt.figure()
ax = fig.add_subplot(111)
for pid, act in activity.items():
ts, ps = zip(*(act)) # unpack the list of tuples of form (t, points) for this story
arrt = np.array(ts) # convert time and points to numpy arrays
mint = arrt.min()
maxt = arrt.max()
arrp = np.array(ps)
# create a regular time grid, because stories can be sampled at different rates/times and its a mess
regt = np.arange(mint, maxt, window_size)
if regt.size <=1: continue # cant even compute derivative...
regp = np.interp(regt, arrt, arrp) # 1d interpolate the signal to regular intervals
# get derivative signal
regpd = np.gradient(regp)
#ds = [datetime.fromtimestamp(t) for t in regt]
plt.plot(regt, regpd)
# lets also annotate the most successful stories of that day
if regpd.max() > title_threshold:
maxix = np.argmax(regpd)
ax.annotate(titles[pid], xy=(regt[maxix], regpd[maxix]), xytext=(regt[maxix], regpd[maxix]+1), arrowprops=dict(facecolor='black', width=1, frac=0.5))
    plt.title('Rate at which stories get points, per %d seconds' % (window_size, ))
#hfmt = matplotlib.dates.DateFormatter('%m/%d %H:%M')
#ax.xaxis.set_major_formatter(hfmt)
#plt.xticks(rotation=25)
activity_to_plot_gradient(activity, titles)
# This is fun, lets do one more!
activity2, titles2 = record_to_activity(gzippickle.load(parsed[3]))
activity_to_plot_gradient(activity2, titles2)
# This one is funny as well :)
activity3, titles3 = record_to_activity(gzippickle.load(parsed[12]))
activity_to_plot_gradient(activity3, titles3)
# Lets build a dictionary that tells us the first time when any pid has appeared. This will be useful later on in analysis
pidt = {}
for ppath in parsed: # note: the parsed file list is not sorted by time here, so keep the earliest time seen
    records = gzippickle.load(ppath)
    for r in records:
        posts = r['new']['posts']
        t = int(r['time'].strftime("%s")) # seconds since epoch when this record was generated
        for p in posts:
            pid = p['id']
            if pid not in pidt or t < pidt[pid]: # keep the earliest time this pid appeared on the NEW page
                pidt[pid] = t
# Show HN raw scores for stories. This is based on formula HN uses which is:
# score = (#votes - 1)^0.8 / (age_hour + 2)^1.8
# lets plot this over time for one day
from math import pow
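# Quick sanity check of the formula with made-up numbers (hn_raw_score is just an illustrative
# helper, not used below): the same point total decays quickly as a story ages.
def hn_raw_score(points, age_hours):
    return pow(points - 1.0, 0.8) / pow(age_hours + 2.0, 1.8)
print 'example raw score for 100 points at 2 hours: ', hn_raw_score(100, 2.0)
print 'example raw score for 100 points at 12 hours: ', hn_raw_score(100, 12.0)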
# Lets also plot Hacker News score
def activity_to_plot_score(activity):
fig = plt.figure()
ax = fig.add_subplot(111)
for pid, act in activity.items():
        ts, ps = zip(*(act)) # unpack the list of tuples of form (t, points) for this story
        ds = [datetime.fromtimestamp(t) for t in ts]
        if pid not in pidt: continue # skip stories we never saw appear on the NEW page (submission time unknown)
        tfirst = pidt[pid]
        ss = [pow(x[1] - 1.0, 0.8) / pow((x[0]-tfirst)/60.0/60.0 + 2.0, 1.8) for x in act]
plt.plot(ds, ss)
plt.title('Raw ranking scores for stories on HN on one day')
hfmt = matplotlib.dates.DateFormatter('%m/%d %H:%M')
ax.xaxis.set_major_formatter(hfmt)
plt.xticks(rotation=25)
activity, titles = record_to_activity(gzippickle.load(parsed[0]))
activity_to_plot_score(activity)
# Okay that was pretty but lets get more serious
# Lets consider the entire span of all recordings.
# Order all collected files in time according to their timestamp
# it's nice if you have a 32GB RAM machine here like I do :p
# otherwise you have to be a bit careful maybe (takes ~6GB)
import dateutil.parser
days = []
for ppath in parsed:
    timestamp = ppath[-32:-6] # slice the timestamp portion out of the filename
t = dateutil.parser.parse(timestamp)
days.append((t, ppath))
days.sort() # put it all in order
allrecords = [] # build a huge database of all snapshots of HN, sorted in time
for t, ppath in days:
allrecords.extend(gzippickle.load(ppath))
print 'Total number of measurements:', len(allrecords)
activity, titles = record_to_activity(allrecords)
# visualize only the most popular stories but across all collection time
# we will have gaps here, as there were gaps in data collection
activity_top = {}
for a,L in activity.items():
max_points = max(x[1] for x in L)
if max_points > 200:
activity_top[a] = L
activity_to_plot_gradient(activity_top, titles, 10*60, 30)
# HN downvotes certain stories based on title keywords, flamewars in comments, and potentially domain names
# Lets investigate what stories get score penalized on Hacker News
# by looking for any stories that have a story with a lower score above it
# a deeper analysis on this is given in http://www.righto.com/2013/11/how-hacker-news-ranking-really-works.html
def investigate_ranking(records):
titles = {} # pid->title mapping
seen_first = {} # mapping of pid->time first seen on HN (i.e. submission time)
niter = 0
told = 0
contiguity_counter = 0
for r in records:
t = int(r['time'].strftime("%s")) # seconds since epoch when this record was generated
# we have to worry about the fact that we have gaps in the data, so sometimes we can
# suddenly be looking at a time several days later. When this is the case, our
# seen_first variable will have a bad time. E.g. We see a story first at 9am, but the story
# could have been up several hours, influencing our score.
        # Therefore, lets always process the NEW page, but process and validate the FRONT
        # page only if we have at least 24 hours of contiguous data up to this point.
        # Consecutive measurements within 15 minutes of each other count as contiguous.
        # (We can also see gaps when HN goes temporarily down.)
if t < told + 15*60:
contiguity_counter += 1
else:
contiguity_counter = 0
told = t
# process NEW page at this time
n = r['new']['posts']
for i,p in enumerate(n):
pid = p['id']
titles[pid] = p['title']
if not pid in seen_first: seen_first[pid] = t
        # process FRONT page at 7 hour intervals, and only if we have at this point 24 hours
        # of contiguously recorded data (snapshots come roughly once a minute, so 60*7
        # iterations is about 7 hours and a contiguity counter of 1440 is about 24 hours)
niter += 1
if (niter % (60*7) == 0) and contiguity_counter > 1440:
#print '-----'
m = r['main']['posts']
min_score = 1000
for i,p in enumerate(m):
pid = p['id']
pts = p['points']
if pts > 0 and pid in seen_first:
t0 = seen_first[pid]
score = pow(pts - 1.0, 0.8) / pow((t - t0)/60.0/60.0 + 2.0, 1.8)
if score / min_score >= 2: # something with lower score is above us. Fishy!
print '%.2f->%.2f (%dc in %dm): (%s) %s' % (score, min_score, p['num_comments'], (t-t0)/60.0, p['domain'], titles.get(pid,''))
if score < min_score:
min_score = score
print 'Showing downvotes'
print 'a->b (c in d) stands for story downweighted from raw score a to score b or less, when it had c comments in d minutes'
print '---'
investigate_ranking(allrecords)
# Lets also put on our detective hats and find stories that were nuked,
# i.e. they were doing well on front page and suddenly disappeared
def investigate_nuking(records):
titles = {} # pid->title mapping
oldr = records[0]
told = 0
contiguity_counter = 0
suspect_nuked = {}
for r in records[1:]:
t = int(r['time'].strftime("%s")) # seconds since epoch when this record was generated
        # only compare consecutive snapshots that are at most 90 seconds apart
        if t < told + 90:
            # ok lets grab the two records: now and at most 90 seconds ago
m1 = oldr['main']['posts'] # last record
m2 = r['main']['posts'] # now
pid_to_story = {}
for p in m1:
pid_to_story[p['id']] = p
# lets only consider top 10, these should all make it across 90 seconds just fine
pids1 = [p['id'] for p in m1[:10]]
pids2 = [p['id'] for p in m2]
# try to look for suspect_nuked PIDs reappearing on the front page
for pid in suspect_nuked:
if pid in pids2:
suspect_nuked[pid] = '' # never mind
pid_missing = set(pids1).difference(pids2)
for pid in pid_missing:
p = pid_to_story[pid]
if p['points'] > 100:
suspect_nuked[pid] = '[%s] (%dp %dc in %dm): (%s) %s' % (pid, p['points'], p['num_comments'], p['minutes_ago'], p['domain'], p['title'])
told = t
oldr = r
for pid, text in suspect_nuked.items():
if text:
print text
print 'Showing almost definitely nuked stories'
print 'These stories were somewhere in top10 on front page and then up to 90 seconds later disappeared forever'
    print 'Some of this could be due to duplicate cleanup. You can see every story by visiting its link directly'
print 'The first number is the story ID, just head over to https://news.ycombinator.com/item?id=6278809 for example'
print '------------'
investigate_nuking(allrecords)
# Okay, now let's try to look for what makes a story successful on Hacker News.
# First lets load all stories we ever saw, on both the front page and the new stories page.
# We are interested in finding signals that tell us what stories make it from the NEW page
# to the FRONT page.
front_posts = {} # dictionaries of pid -> post
front_posts_top10 = {}
new_posts = {} # dictionaries of pid -> posts
for ppath in parsed:
#print 'analyzing ', ppath
records = gzippickle.load(ppath)
for r in records:
m = r['main']
for i,p in enumerate(m['posts']):
pid = p['id']
front_posts[pid] = p
if i<10:
front_posts_top10[pid] = p
n = r['new']
for i,p in enumerate(n['posts']):
pid = p['id']
new_posts[pid] = p
print 'total unique posts that we saw on front page: ', len(front_posts)
print 'and those that also made it to top 10: ', len(front_posts_top10)
print 'total unique posts that we saw on new page: ', len(new_posts)
print 'front page success ratio is %.2f%%' % (len(front_posts)*100.0 / len(new_posts), )
print 'top10 success ratio is %.2f%%' % (len(front_posts_top10)*100.0 / len(new_posts), )
fps = set(front_posts.keys())
nps = set(new_posts.keys())
pos = fps.intersection(nps)
neg = nps.difference(fps)
print 'total number of unique, successful posts: ', len(pos)
print 'total number of failed posts: ', len(neg)
# lets study effect of domains
from math import log
cp = {}
cn = {}
for pid in pos:
dom = new_posts[pid]['domain']
cp[dom] = cp.get(dom, 0.0) + 1
for pid in neg:
dom = new_posts[pid]['domain']
cn[dom] = cn.get(dom, 0.0) + 1
alldoms = set(cp.keys()).union(set(cn.keys()))
ratios = []
for dom in alldoms:
ratios.append((log(cp.get(dom, 0.0)+10.0) - log(cn.get(dom, 0.0)+10.0), dom)) # some smoothing here
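# (To get a feel for the pseudo-count smoothing above, with made-up counts: a domain seen once
#  and never rejected gets log(1+10) - log(0+10) ~= 0.10, while one with 50 front page hits and
#  5 misses gets log(60) - log(15) ~= 1.39, so rarely-submitted domains can't dominate the ranking.)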
print 'total number of unique domains: ', len(alldoms)
print '----------'
# now sort and display them
ratios.sort()
for rat in ratios[:30]:
d = rat[1]
print 'HN _hates_ domain %s. front page %d/%d times' % (d, cp.get(d,0), cp.get(d,0)+cn.get(d,0))
print '----------'
print 'terrible domains: submitted more than 20 times and nothing got to front page:'
print ', '.join(d for d in cn if cn[d] > 20 and cp.get(d, 0) == 0)
print '----------'
num_printed = 0
for rat in reversed(ratios): # walk from the highest (most loved) ratios down
    d = rat[1]
    if cp.get(d,0)<=2: continue # never mind these, too few data points to go on
    print 'HN _loves_ domain %s. front page %d/%d times' % (d, cp.get(d,0), cp.get(d,0)+cn.get(d,0))
    num_printed += 1
    if num_printed >= 40: break
# lets study the effect of users now. I'll just copy-paste the above but swap users in for domains
from math import log
cp = {}
cn = {}
for pid in pos:
dom = new_posts[pid]['user']
cp[dom] = cp.get(dom, 0.0) + 1
for pid in neg:
dom = new_posts[pid]['user']
cn[dom] = cn.get(dom, 0.0) + 1
alldoms = set(cp.keys()).union(set(cn.keys()))
ratios = []
for dom in alldoms:
ratios.append((log(cp.get(dom, 0.0)+5.0) - log(cn.get(dom, 0.0)+5.0), dom)) # some smoothing here
print 'total number of unique users: ', len(alldoms)
print '----------'
# now sort and display them
ratios.sort()
for rat in ratios[:20]:
d = rat[1]
print 'HN _hates_ user %s. front page %d/%d times' % (d, cp.get(d,0), cp.get(d,0)+cn.get(d,0))
print '----------'
for rat in ratios[-1:-20:-1]:
d = rat[1]
if cp.get(d,0)<=1: continue # never mind these, only one data point to go on
print 'HN _loves_ user %s. front page %d/%d times' % (d, cp.get(d,0), cp.get(d,0)+cn.get(d,0))
# Now lets look at the chances of a story working its way to the front page, as a function of submission time
# we'll use dictionary pidt we made before, which gives pid -> time this post was first seen
# Lets build up two arrays of size 7x24: they will store counts of positive and negative stories
# here, we define a positive story as one that eventually reached the front page
# All times are in PST.
arrpos = np.zeros((7, 24))
arrneg = np.zeros((7, 24))
for pid,t in pidt.iteritems():
dt = datetime.fromtimestamp(t)
# dt.weekday() gives 0 for Monday, ... 6 for Sunday
# print t, dt, dt.weekday(), dt.hour
wk = dt.weekday()
wt = dt.hour
if pid in pos:
arrpos[wk, wt] += 1.0
else:
arrneg[wk, wt] += 1.0
# helper function that smooths each weekday's counts along the hour-of-day axis, because
# we don't have THAT much data and this should vary slowly in time
def smooth_days(arr):
for i in range(7):
arr[i, :] = np.convolve(arr[i, :], np.array([0.5, 1, 0.5])/2.0, 'same')
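# Tiny illustration of what the smoothing does (toy numbers, not HN data): a single spike
# gets spread over the neighbouring hours while the total count is preserved.
toy = np.zeros((7, 24))
toy[0, 12] = 4.0
smooth_days(toy)
print 'smoothed toy row around the spike: ', toy[0, 10:15] # expect [0, 1, 2, 1, 0]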
smooth_days(arrpos)
smooth_days(arrneg)
arrtot = arrpos + arrneg
print 'basing the following on %d total stories...' % (arrtot.sum())
plt.figure()
plt.imshow(arrtot , interpolation='nearest', cmap='jet')
plt.title('total number of submissions for every time slot')
plt.xlabel('hour of day')
plt.ylabel('weekday (0 = Monday)')
plt.figure()
logposratio = np.log(np.divide(arrpos, arrneg)) # note: assumes no time slot ended up with zero negatives after smoothing
plt.imshow(logposratio, interpolation='nearest', cmap='jet')
plt.title('positiveness log ratio: log #positives / #negatives per slot')
print 'total number of submissions per week day, starting with Monday: ', arrtot.sum(axis=1).astype(int)
plt.figure()
plt.bar(np.arange(24), arrtot.sum(axis=0))
plt.title('#posts submitted per hour of the day (PST)')
# poof noon is busy time on HN!
# okay, now lets look at the best time to actually submit a story according to this
def my2str(t):
d,h = t
    return ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'][d] + ', ' + str(h) + ':00'
lprcp = np.copy(logposratio)
for k in range(10):
maxi = lprcp.argmax()
maxv = lprcp.max()
topk = np.unravel_index(maxi, lprcp.shape)
    print 'it is great to submit on %s: %d made it to the front page vs %d that did not (logratio %.2f)' % (my2str(topk), arrpos[topk[0], topk[1]], arrneg[topk[0], topk[1]], maxv)
lprcp[topk[0], topk[1]] = -1000000 # hack... ;p
print '-----------'
lprcp = np.copy(-logposratio)
for k in range(10):
maxi = lprcp.argmax()
maxv = lprcp.max()
topk = np.unravel_index(maxi, lprcp.shape)
    print 'it sucks to submit on %s: %d made it to the front page vs %d that did not (logratio %.2f)' % (my2str(topk), arrpos[topk[0], topk[1]], arrneg[topk[0], topk[1]], -maxv)
lprcp[topk[0], topk[1]] = -1000000 # hack... ;p
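# Now lets see whether the words in a title predict whether a story makes it to the front page.
# We turn the titles into bag-of-words counts (1-2 grams) and fit a simple Naive Bayes classifier.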
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=2, stop_words = 'english', strip_accents = 'unicode', lowercase=True, ngram_range=(1,2))
vectorizer
pos_titles = [new_posts[pid]['title'] for pid in pos]
neg_titles = [new_posts[pid]['title'] for pid in neg]
X = vectorizer.fit_transform(pos_titles + neg_titles)
y = np.array([2]*len(pos_titles) + [1]*len(neg_titles))
import random
feats = vectorizer.get_feature_names()
print 'dimensions of the word-count sparse matrix (#examples x #features): ', X.shape
print 'few random features: ', ', '.join(random.sample(feats, 20))
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X, y)
# get classification accuracy for our training set
# this will tell us how easily we can tell if a story will make it to frontpage
# based ONLY on the words in it
yhat = clf.predict(X)
print 'predicting all stories as negative gives baseline accuracy: ', len(neg_titles)*1.0/(len(neg_titles)+len(pos_titles))
print 'naive bayes on train data accuracy: ', np.mean(yhat == y)
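# As an aside, train-set accuracy is optimistic; a cross-validated estimate is more honest.
# A quick sketch, assuming an older sklearn where cross_val_score lives in sklearn.cross_validation
# (newer versions moved it to sklearn.model_selection):
from sklearn.cross_validation import cross_val_score
cv_scores = cross_val_score(MultinomialNB(), X, y, cv=5) # 5-fold cross-validation accuracy
print 'naive bayes 5-fold cross-validation accuracy: ', cv_scores.mean()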
# Finally lets look at the features that are most predictive for the positive class
# (in our case if a story makes it from New to Front page)
pf = [(clf.feature_log_prob_[1, i], feats[i]) for i in range(len(feats))] # row 1 corresponds to clf.classes_[1] == 2, i.e. the positive (made it to front page) class
pf.sort(reverse=True)
for p in pf[:50]:
print 'Positive word %.2f: %s' % (p[0], p[1])
print ', '.join([x[1] for x in pf[:50]])