import glob
parsed = glob.glob('/home/karpathy/HNtrend/db/parsed_hn*.pgzip')
print 'total number of days of data: ', len(parsed)
# lets get a sense of the data and what these records look like
import gzippickle
print parsed[0]
records = gzippickle.load(parsed[0])
print records[0]['main']['posts'][0]
print records[0]['new']['posts'][0]
print records[0]['main']['posts'][1]
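# Aside: gzippickle is a tiny helper module that isn't included in this notebook; judging by the
# .pgzip extension it is presumably just gzip-compressed pickles. A minimal, hypothetical sketch of
# what its load() might look like (the real module may differ):
def _gzippickle_load_sketch(path):
    import gzip, cPickle
    with gzip.open(path, 'rb') as f:
        return cPickle.load(f) # unpickle the decompressed byte stream
# For reference, the post fields this notebook relies on later are:
# p['id'], p['title'], p['points'], p['num_comments'], p['minutes_ago'], p['domain'], p['user'],
# and each record r has r['time'], r['main']['posts'] and r['new']['posts'].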
# lets visualize activity on one day. First compute activity for one day here
records = gzippickle.load(parsed[0]) # one of the days
# this utility function will create dict of [post id] -> [(t, points), ...]
# where t is time
def record_to_activity(records):
activity = {}
titles = {}
for r in records:
m = r['main']['posts'] + r['new']['posts'] # lets put all posts together from both pages
t = int(r['time'].strftime("%s")) # seconds since epoch when this record was generated
pids_seen = []
for i,p in enumerate(m):
pid = p['id']
if pid in pids_seen: continue # there can be duplicates in NEW and FRONT
pids_seen.append(pid)
if not pid in activity: activity[pid] = []
L = activity[pid]
pts = p['points']
if pts > 0: # if its not positive we somehow had incorrect measurement
L.append((t, pts))
titles[pid] = p['title']
# make one more pass and sort all lists just to make sure time order is respected
for pid in activity: activity[pid].sort()
return (activity, titles)
activity, titles = record_to_activity(records)
print 'number of unique stories this day: ', len(activity)
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.dates
from datetime import datetime
import numpy as np
plt.rcParams['figure.figsize'] = (20.0, 8.0)
def activity_to_plot(activity):
fig = plt.figure()
ax = fig.add_subplot(111)
for pid, act in activity.items():
ts, ps = zip(*(act)) # unpack the list of tuples of form (t, points) for this story
ds = [datetime.fromtimestamp(t) for t in ts]
plt.plot(ds, ps)
plt.title('Point values for stories on HN on one day')
hfmt = matplotlib.dates.DateFormatter('%m/%d %H:%M')
ax.xaxis.set_major_formatter(hfmt)
plt.xticks(rotation=25)
activity_to_plot(activity)
# Okay that was boring, lets instead visualize the derivative which should be more interesting
def activity_to_plot_gradient(activity, titles, window_size = 10*60, title_threshold = 10):
fig = plt.figure()
ax = fig.add_subplot(111)
for pid, act in activity.items():
ts, ps = zip(*(act)) # unpack the list of tuples of form (t, points) for this story
arrt = np.array(ts) # convert time and points to numpy arrays
mint = arrt.min()
maxt = arrt.max()
arrp = np.array(ps)
# create a regular time grid, because stories can be sampled at different rates/times and its a mess
regt = np.arange(mint, maxt, window_size)
if regt.size <=1: continue # cant even compute derivative...
regp = np.interp(regt, arrt, arrp) # 1d interpolate the signal to regular intervals
# get derivative signal
regpd = np.gradient(regp)
#ds = [datetime.fromtimestamp(t) for t in regt]
plt.plot(regt, regpd)
# lets also annotate the most successful stories of that day
if regpd.max() > title_threshold:
maxix = np.argmax(regpd)
ax.annotate(titles[pid], xy=(regt[maxix], regpd[maxix]), xytext=(regt[maxix], regpd[maxix]+1), arrowprops=dict(facecolor='black', width=1, frac=0.5))
    plt.title('Rate at which stories get points, per %d seconds' % (window_size, ))
#hfmt = matplotlib.dates.DateFormatter('%m/%d %H:%M')
#ax.xaxis.set_major_formatter(hfmt)
#plt.xticks(rotation=25)
activity_to_plot_gradient(activity, titles)
# This is fun, lets do one more!
activity2, titles2 = record_to_activity(gzippickle.load(parsed[3]))
activity_to_plot_gradient(activity2, titles2)
# This one is funny as well :)
activity3, titles3 = record_to_activity(gzippickle.load(parsed[12]))
activity_to_plot_gradient(activity3, titles3)
# Lets build a dictionary that tells us the first time when any pid has appeared. This will be useful later on in analysis
pidt = {}
for ppath in parsed: # note: the parsed file list is not sorted by time here, so keep the earliest time seen
    records = gzippickle.load(ppath)
    for r in records:
        posts = r['new']['posts']
        t = int(r['time'].strftime("%s")) # seconds since epoch when this record was generated
        for p in posts:
            pid = p['id']
            if pid not in pidt or t < pidt[pid]: # keep the earliest time this pid appeared on the NEW page
                pidt[pid] = t
# Show HN raw scores for stories. This is based on formula HN uses which is:
# score = (#votes - 1)^0.8 / (age_hour + 2)^1.8
# lets plot this over time for one day
from math import pow
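# Quick sanity check of the formula with made-up numbers (hn_raw_score is just an illustrative
# helper, not used below): the same point total decays quickly as a story ages.
def hn_raw_score(points, age_hours):
    return pow(points - 1.0, 0.8) / pow(age_hours + 2.0, 1.8)
print 'example raw score for 100 points at 2 hours: ', hn_raw_score(100, 2.0)
print 'example raw score for 100 points at 12 hours: ', hn_raw_score(100, 12.0)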
# Lets also plot Hacker News score
def activity_to_plot_score(activity):
fig = plt.figure()
ax = fig.add_subplot(111)
for pid, act in activity.items():
        ts, ps = zip(*(act)) # unpack the list of tuples of form (t, points) for this story
        ds = [datetime.fromtimestamp(t) for t in ts]
        if pid not in pidt: continue # skip stories we never saw appear on the NEW page (submission time unknown)
        tfirst = pidt[pid]
        ss = [pow(x[1] - 1.0, 0.8) / pow((x[0]-tfirst)/60.0/60.0 + 2.0, 1.8) for x in act]
plt.plot(ds, ss)
plt.title('Raw ranking scores for stories on HN on one day')
hfmt = matplotlib.dates.DateFormatter('%m/%d %H:%M')
ax.xaxis.set_major_formatter(hfmt)
plt.xticks(rotation=25)
activity, titles = record_to_activity(gzippickle.load(parsed[0]))
activity_to_plot_score(activity)
# Okay that was pretty but lets get more serious
# Lets consider the entire span of all recordings.
# Order all collected files in time according to their timestamp
# it's nice if you have a 32GB RAM machine here like I do :p
# otherwise you have to be a bit careful maybe (takes ~6GB)
import dateutil.parser
days = []
for ppath in parsed:
    timestamp = ppath[-32:-6] # slice the timestamp portion out of the filename
t = dateutil.parser.parse(timestamp)
days.append((t, ppath))
days.sort() # put it all in order
allrecords = [] # build a huge database of all snapshots of HN, sorted in time
for t, ppath in days:
allrecords.extend(gzippickle.load(ppath))
print 'Total number of measurements:', len(allrecords)
activity, titles = record_to_activity(allrecords)
# visualize only the most popular stories but across all collection time
# we will have gaps here, as there were gaps in data collection
activity_top = {}
for a,L in activity.items():
max_points = max(x[1] for x in L)
if max_points > 200:
activity_top[a] = L
activity_to_plot_gradient(activity_top, titles, 10*60, 30)
# HN downvotes certain stories based on title keywords, flamewars in comments, and potentially domain names
# Lets investigate what stories get score penalized on Hacker News
# by looking for any stories that have a story with a lower score above it
# a deeper analysis on this is given in http://www.righto.com/2013/11/how-hacker-news-ranking-really-works.html
def investigate_ranking(records):
titles = {} # pid->title mapping
seen_first = {} # mapping of pid->time first seen on HN (i.e. submission time)
niter = 0
told = 0
contiguity_counter = 0
for r in records:
t = int(r['time'].strftime("%s")) # seconds since epoch when this record was generated
# we have to worry about the fact that we have gaps in the data, so sometimes we can
# suddenly be looking at a time several days later. When this is the case, our
# seen_first variable will have a bad time. E.g. We see a story first at 9am, but the story
# could have been up several hours, influencing our score.
        # Therefore, lets always process the NEW page, but process and validate the FRONT
        # page only if we have at least 24 hours of contiguous data up to this point.
        # Consecutive measurements within 15 minutes of each other count as contiguous.
        # (We can also see gaps when HN goes temporarily down.)
if t < told + 15*60:
contiguity_counter += 1
else:
contiguity_counter = 0
told = t
# process NEW page at this time
n = r['new']['posts']
for i,p in enumerate(n):
pid = p['id']
titles[pid] = p['title']
if not pid in seen_first: seen_first[pid] = t
        # process FRONT page at 7 hour intervals, and only if we have at this point 24 hours
        # of contiguously recorded data (snapshots come roughly once a minute, so 60*7
        # iterations is about 7 hours and a contiguity counter of 1440 is about 24 hours)
niter += 1
if (niter % (60*7) == 0) and contiguity_counter > 1440:
#print '-----'
m = r['main']['posts']
min_score = 1000
for i,p in enumerate(m):
pid = p['id']
pts = p['points']
if pts > 0 and pid in seen_first:
t0 = seen_first[pid]
score = pow(pts - 1.0, 0.8) / pow((t - t0)/60.0/60.0 + 2.0, 1.8)
if score / min_score >= 2: # something with lower score is above us. Fishy!
print '%.2f->%.2f (%dc in %dm): (%s) %s' % (score, min_score, p['num_comments'], (t-t0)/60.0, p['domain'], titles.get(pid,''))
if score < min_score:
min_score = score
print 'Showing downvotes'
print 'a->b (c in d) stands for story downweighted from raw score a to score b or less, when it had c comments in d minutes'
print '---'
investigate_ranking(allrecords)
# Lets also put on our detective hats and find stories that were nuked,
# i.e. they were doing well on front page and suddenly disappeared
def investigate_nuking(records):
titles = {} # pid->title mapping
oldr = records[0]
told = 0
contiguity_counter = 0
suspect_nuked = {}
for r in records[1:]:
t = int(r['time'].strftime("%s")) # seconds since epoch when this record was generated
        # only compare consecutive snapshots that are at most 90 seconds apart
        if t < told + 90:
            # ok lets grab the two records: now and at most 90 seconds ago
m1 = oldr['main']['posts'] # last record
m2 = r['main']['posts'] # now
pid_to_story = {}
for p in m1:
pid_to_story[p['id']] = p
# lets only consider top 10, these should all make it across 90 seconds just fine
pids1 = [p['id'] for p in m1[:10]]
pids2 = [p['id'] for p in m2]
# try to look for suspect_nuked PIDs reappearing on the front page
for pid in suspect_nuked:
if pid in pids2:
suspect_nuked[pid] = '' # never mind
pid_missing = set(pids1).difference(pids2)
for pid in pid_missing:
p = pid_to_story[pid]
if p['points'] > 100:
suspect_nuked[pid] = '[%s] (%dp %dc in %dm): (%s) %s' % (pid, p['points'], p['num_comments'], p['minutes_ago'], p['domain'], p['title'])
told = t
oldr = r
for pid, text in suspect_nuked.items():
if text:
print text
print 'Showing almost definitely nuked stories'
print 'These stories were somewhere in top10 on front page and then up to 90 seconds later disappeared forever'
    print 'Some of this could be due to duplicate cleanup. You can see every story by visiting its link directly'
print 'The first number is the story ID, just head over to https://news.ycombinator.com/item?id=6278809 for example'
print '------------'
investigate_nuking(allrecords)
# Okay, now let's try to look for what makes a story successful on Hacker News.
# First lets load all stories we ever saw, on both the front page and the new stories page.
# We are interested in finding signals that tell us what stories make it from the NEW page
# to the FRONT page.
front_posts = {} # dictionaries of pid -> post
front_posts_top10 = {}
new_posts = {} # dictionaries of pid -> posts
for ppath in parsed:
#print 'analyzing ', ppath
records = gzippickle.load(ppath)
for r in records:
m = r['main']
for i,p in enumerate(m['posts']):
pid = p['id']
front_posts[pid] = p
if i<10:
front_posts_top10[pid] = p
n = r['new']
for i,p in enumerate(n['posts']):
pid = p['id']
new_posts[pid] = p
print 'total unique posts that we saw on front page: ', len(front_posts)
print 'and those that also made it to top 10: ', len(front_posts_top10)
print 'total unique posts that we saw on new page: ', len(new_posts)
print 'front page success ratio is %.2f%%' % (len(front_posts)*100.0 / len(new_posts), )
print 'top10 success ratio is %.2f%%' % (len(front_posts_top10)*100.0 / len(new_posts), )
fps = set(front_posts.keys())
nps = set(new_posts.keys())
pos = fps.intersection(nps)
neg = nps.difference(fps)
print 'total number of unique, successful posts: ', len(pos)
print 'total number of failed posts: ', len(neg)
# lets study effect of domains
from math import log
cp = {}
cn = {}
for pid in pos:
dom = new_posts[pid]['domain']
cp[dom] = cp.get(dom, 0.0) + 1
for pid in neg:
dom = new_posts[pid]['domain']
cn[dom] = cn.get(dom, 0.0) + 1
alldoms = set(cp.keys()).union(set(cn.keys()))
ratios = []
for dom in alldoms:
ratios.append((log(cp.get(dom, 0.0)+10.0) - log(cn.get(dom, 0.0)+10.0), dom)) # some smoothing here
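# (To get a feel for the pseudo-count smoothing above, with made-up counts: a domain seen once
#  and never rejected gets log(1+10) - log(0+10) ~= 0.10, while one with 50 front page hits and
#  5 misses gets log(60) - log(15) ~= 1.39, so rarely-submitted domains can't dominate the ranking.)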
print 'total number of unique domains: ', len(alldoms)
print '----------'
# now sort and display them
ratios.sort()
for rat in ratios[:30]:
d = rat[1]
print 'HN _hates_ domain %s. front page %d/%d times' % (d, cp.get(d,0), cp.get(d,0)+cn.get(d,0))
print '----------'
print 'terrible domains: submitted more than 20 times and nothing got to front page:'
print ', '.join(d for d in cn if cn[d] > 20 and cp.get(d, 0) == 0)
print '----------'
num_printed = 0
for rat in reversed(ratios): # walk from the highest (most loved) ratios down
    d = rat[1]
    if cp.get(d,0)<=2: continue # never mind these, too few data points to go on
    print 'HN _loves_ domain %s. front page %d/%d times' % (d, cp.get(d,0), cp.get(d,0)+cn.get(d,0))
    num_printed += 1
    if num_printed >= 40: break
# lets study the effect of users now. I'll just copy-paste the above but swap users in for domains
from math import log
cp = {}
cn = {}
for pid in pos:
dom = new_posts[pid]['user']
cp[dom] = cp.get(dom, 0.0) + 1
for pid in neg:
dom = new_posts[pid]['user']
cn[dom] = cn.get(dom, 0.0) + 1
alldoms = set(cp.keys()).union(set(cn.keys()))
ratios = []
for dom in alldoms:
ratios.append((log(cp.get(dom, 0.0)+5.0) - log(cn.get(dom, 0.0)+5.0), dom)) # some smoothing here
print 'total number of unique users: ', len(alldoms)
print '----------'
# now sort and display them
ratios.sort()
for rat in ratios[:20]:
d = rat[1]
print 'HN _hates_ user %s. front page %d/%d times' % (d, cp.get(d,0), cp.get(d,0)+cn.get(d,0))
print '----------'
for rat in ratios[-1:-20:-1]:
d = rat[1]
if cp.get(d,0)<=1: continue # never mind these, only one data point to go on
print 'HN _loves_ user %s. front page %d/%d times' % (d, cp.get(d,0), cp.get(d,0)+cn.get(d,0))
# Now lets look at the chances of a story working its way to the front page, as a function of submission time
# we'll use dictionary pidt we made before, which gives pid -> time this post was first seen
# Lets build up two arrays of size 7x24: they will store counts of positive and negative stories
# here, we define a positive story as one that eventually reached the front page
# All times are in PST.
arrpos = np.zeros((7, 24))
arrneg = np.zeros((7, 24))
for pid,t in pidt.iteritems():
dt = datetime.fromtimestamp(t)
# dt.weekday() gives 0 for Monday, ... 6 for Sunday
# print t, dt, dt.weekday(), dt.hour
wk = dt.weekday()
wt = dt.hour
if pid in pos:
arrpos[wk, wt] += 1.0
else:
arrneg[wk, wt] += 1.0
# helper function that smooths each weekday's counts along the hour-of-day axis, because
# we don't have THAT much data and this should vary slowly in time
def smooth_days(arr):
for i in range(7):
arr[i, :] = np.convolve(arr[i, :], np.array([0.5, 1, 0.5])/2.0, 'same')
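# Tiny illustration of what the smoothing does (toy numbers, not HN data): a single spike
# gets spread over the neighbouring hours while the total count is preserved.
toy = np.zeros((7, 24))
toy[0, 12] = 4.0
smooth_days(toy)
print 'smoothed toy row around the spike: ', toy[0, 10:15] # expect [0, 1, 2, 1, 0]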
smooth_days(arrpos)
smooth_days(arrneg)
arrtot = arrpos + arrneg
print 'basing the following on %d total stories...' % (arrtot.sum())
plt.figure()
plt.imshow(arrtot , interpolation='nearest', cmap='jet')
plt.title('total number of submissions for every time slot')
plt.xlabel('hour of day')
plt.ylabel('weekday (0 = Monday)')
plt.figure()
logposratio = np.log(np.divide(arrpos, arrneg)) # note: assumes no time slot ended up with zero negatives after smoothing
plt.imshow(logposratio, interpolation='nearest', cmap='jet')
plt.title('positiveness log ratio: log #positives / #negatives per slot')
print 'total number of submissions per week day, starting with Monday: ', arrtot.sum(axis=1).astype(int)
plt.figure()
plt.bar(np.arange(24), arrtot.sum(axis=0))
plt.title('#posts submitted per hour of the day (PST)')
# poof noon is busy time on HN!
# okay, now lets look at the best time to actually submit a story according to this
def my2str(t):
d,h = t
    return ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'][d] + ', ' + str(h) + ':00'
lprcp = np.copy(logposratio)
for k in range(10):
maxi = lprcp.argmax()
maxv = lprcp.max()
topk = np.unravel_index(maxi, lprcp.shape)
    print 'it is great to submit on %s: %d made it to the front page vs %d that did not (logratio %.2f)' % (my2str(topk), arrpos[topk[0], topk[1]], arrneg[topk[0], topk[1]], maxv)
lprcp[topk[0], topk[1]] = -1000000 # hack... ;p
print '-----------'
lprcp = np.copy(-logposratio)
for k in range(10):
maxi = lprcp.argmax()
maxv = lprcp.max()
topk = np.unravel_index(maxi, lprcp.shape)
    print 'it sucks to submit on %s: %d made it to the front page vs %d that did not (logratio %.2f)' % (my2str(topk), arrpos[topk[0], topk[1]], arrneg[topk[0], topk[1]], -maxv)
lprcp[topk[0], topk[1]] = -1000000 # hack... ;p
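# Now lets see whether the words in a title predict whether a story makes it to the front page.
# We turn the titles into bag-of-words counts (1-2 grams) and fit a simple Naive Bayes classifier.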
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=2, stop_words = 'english', strip_accents = 'unicode', lowercase=True, ngram_range=(1,2))
vectorizer
pos_titles = [new_posts[pid]['title'] for pid in pos]
neg_titles = [new_posts[pid]['title'] for pid in neg]
X = vectorizer.fit_transform(pos_titles + neg_titles)
y = np.array([2]*len(pos_titles) + [1]*len(neg_titles))
import random
feats = vectorizer.get_feature_names()
print 'dimensions of the word-count sparse matrix (#examples x #features): ', X.shape
print 'few random features: ', ', '.join(random.sample(feats, 20))
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X, y)
# get classification accuracy for our training set
# this will tell us how easily we can tell if a story will make it to frontpage
# based ONLY on the words in it
yhat = clf.predict(X)
print 'predicting all stories as negative gives baseline accuracy: ', len(neg_titles)*1.0/(len(neg_titles)+len(pos_titles))
print 'naive bayes on train data accuracy: ', np.mean(yhat == y)
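# As an aside, train-set accuracy is optimistic; a cross-validated estimate is more honest.
# A quick sketch, assuming an older sklearn where cross_val_score lives in sklearn.cross_validation
# (newer versions moved it to sklearn.model_selection):
from sklearn.cross_validation import cross_val_score
cv_scores = cross_val_score(MultinomialNB(), X, y, cv=5) # 5-fold cross-validation accuracy
print 'naive bayes 5-fold cross-validation accuracy: ', cv_scores.mean()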
# Finally lets look at the features that are most predictive for the positive class
# (in our case if a story makes it from New to Front page)
pf = [(clf.feature_log_prob_[1, i], feats[i]) for i in range(len(feats))] # row 1 corresponds to clf.classes_[1] == 2, i.e. the positive (made it to front page) class
pf.sort(reverse=True)
for p in pf[:50]:
print 'Positive word %.2f: %s' % (p[0], p[1])
print ', '.join([x[1] for x in pf[:50]])