In [3]:
"""Fit SGDClassifier on a Tf-Idf matrix, evaluate it against the validation set
and plot the most and least important words per category"""
%load_ext autoreload
%autoreload 2
from app.evaluation import eval_classifier
from app.training import get_best_text_pipeline, get_undersample_df
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy

train_df = pd.read_csv("data/processed_data.csv")
train_df = get_undersample_df(train_df)
val_df = pd.DataFrame.from_csv("data/validation_data.csv")
y_train = train_df.pop("label")
y_val = val_df.pop("label")

le = LabelEncoder().fit(y_train)
classes = le.classes_

for text_feature in ["readme"]:  
    train_df[text_feature].fillna("", inplace=True)
    val_df[text_feature].fillna("", inplace=True)
    X_train = train_df[text_feature].values
    X_val = val_df[text_feature].values

    ppl = find_best_text_pipeline(X_train, y_train)
    print "Accuracy for {} feature:".format(text_feature)
    acc = eval_classifier(ppl, X_val, y_val, le.classes_)
    print acc


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Fitting 3 folds for each of 36 candidates, totalling 108 fits
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed: 15.2min finished
Accuracy for readme feature:
0.516129032258

In [7]:
from app.training import save_pickle
save_pickle(ppl, "best_readme_pipeline_5161")

In [6]:
ppl.named_steps


Out[6]:
{'clf': SGDClassifier(alpha=1e-05, average=False, class_weight=None, epsilon=0.1,
        eta0=0.0, fit_intercept=True, l1_ratio=0.15,
        learning_rate='optimal', loss='log', n_iter=10, n_jobs=1,
        penalty='l2', power_t=0.5, random_state=None, shuffle=True,
        verbose=0, warm_start=False),
 'tfidf': TfidfTransformer(norm=u'l2', smooth_idf=True, sublinear_tf=False,
          use_idf=True),
 'vect': CountVectorizer(analyzer=<function stemmed_words at 0x7fdd4dea82a8>,
         binary=False, decode_error=u'strict', dtype=<type 'numpy.int64'>,
         encoding=u'utf-8', input=u'content', lowercase=True, max_df=1.0,
         max_features=None, min_df=1, ngram_range=(1, 2), preprocessor=None,
         stop_words='english', strip_accents=None,
         token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None,
         vocabulary=None)}

In [2]:
val_df = pd.DataFrame.from_csv("data/validation_data.csv")
val_df.fillna("description", inplace=True)
for i, row in val_df.iterrows():
    true = row["label"]
    pred = ppl.predict([row["description"]])
    if true != pred:
        print "{}: label: {}\tpredicted: {} for\n{}".format(row["repository"], true, pred, row["description"])


hpi-swt2-exercise/java-tdd-challenge: label: EDU	predicted: ['DATA'] for
Fork this repository to learn TDD in Java
vhf/free-programming-books: label: DATA	predicted: ['DOCS'] for
:books: Freely available programming books
d3/d3: label: DEV	predicted: ['DATA'] for
Bring data to life with SVG, Canvas and HTML. :bar_chart::chart_with_upwards_trend::tada:
carlosmn/CoMa-II: label: HW	predicted: ['WEB'] for
description
git/git-scm.com: label: DEV	predicted: ['WEB'] for
The git-scm.com website. Note that this repository is only for the website; issues with git itself should go to https://git-scm.com/community.
PowerDNS/pdns: label: DEV	predicted: ['WEB'] for
PowerDNS
cmrberry/cs6300-git-practice: label: HW	predicted: ['WEB'] for
description
mongodb/docs: label: DOCS	predicted: ['WEB'] for
The MongoDB Documentation Project Source.
sindresorhus/eslint-config-xo: label: DEV	predicted: ['WEB'] for
ESLint shareable config for XO
e-books/backbone.en.douceur: label: EDU	predicted: ['WEB'] for
le gros tuto pour apprendre Backbone.js
erikflowers/weather-icons: label: DEV	predicted: ['DOCS'] for
215 Weather Themed Icons and CSS
tensorflow/tensorflow: label: DEV	predicted: ['DOCS'] for
Computation using data flow graphs for scalable machine learning
Chicago/food-inspections-evaluation: label: WEB	predicted: ['DATA'] for
This repository contains the code to generate predictions of critical violations at food establishments in Chicago. It also contains the results of an evaluation of the effectiveness of those predictions.
OpenInstitute/OpenDuka: label: DEV	predicted: ['DATA'] for
Open Duka is a project designed by the Open Institute that will provide a freely accessible database of information on Kenyan entities.   This information will provide citizens, journalists, and civic activists with a practical and easy-to-use tool to understand the ownership structure of the world they live in, demonstrating the practical applications of open information for normal citizens. It will serve as a core dataset for citizens, journalists, and civic activists who want to build 3rd party public transparency and public accountability apps or services, by allowing them to cross reference the Open Duka company shareholder data against other datasets. 
torvalds/linux: label: DEV	predicted: ['WEB'] for
Linux kernel source tree
hughperkins/howto-jenkins-ssl: label: DOCS	predicted: ['DEV'] for
quick how to on activating ssl on jenkins, so I can find it easily

In [3]:
from operator import itemgetter

vect = ppl.named_steps["vect"]
clf = ppl.named_steps["clf"]
tvec = clf.coef_
output = []
for i in range(len(tvec)):
    output.append(classes[i])
    coefs = sorted(zip(tvec[i], vect.get_feature_names()), key=itemgetter(0), reverse=True)
    n = 20
    topn = zip(coefs[:n], coefs[:-(n+1):-1])
    output.append("{:>15}    {: >20}".format("Positive Words", "Negative Words"))
    for (cp, fnp), (cn, fnn) in topn:
        output.append(u"{:0.4f}{: >15}    {:0.4f}{: >15}".format(
            cp, fnp, cn, fnn
            )
    )
    output.append("--------------------")
print "\n".join(output)


DATA
 Positive Words          Negative Words
4.5454        dataset    -3.5198          curat
4.4541           data    -3.3275         websit
3.7519             us    -3.2966         materi
3.5394        countri    -3.2948          upcom
3.4947       catalogu    -3.2948      hackathon
3.4947      exoplanet    -3.1064        footbal
3.4631           name    -3.1063           blog
3.4136          price    -2.6902          spice
3.2948            639    -2.6902        rethink
3.2553       metadata    -2.5849           tool
3.2390          index    -2.5200          cours
3.2304     datapackag    -2.4645        resourc
2.8926            iso    -2.4468       document
2.7931           unit    -2.3884          sourc
2.7893         global    -2.3396          solut
2.7828          natur    -2.3396       homework
2.6872             as    -2.3298           我的博客
2.6724          locat    -2.3298      北邮人常用网址导航
2.4636        classif    -2.3298         我的前端博客
2.4337         govern    -2.3298 siberiawolf的小窝
--------------------
DEV
 Positive Words          Negative Words
4.7322         editor    -5.8304         materi
4.7142      framework    -4.6545        resourc
4.4667          build    -4.1314       document
4.0353         racket    -3.9188           this
3.8595       platform    -3.6836           blog
3.7701         compil    -3.5748          cours
3.4947        simplic    -3.5695           list
3.4281             3d    -3.4491          solut
3.3429        languag    -3.2474         websit
3.2104            vim    -3.1587       homework
3.1636          uniti    -3.1064           site
3.1614          grade    -2.8117             at
3.0895             vr    -2.6902           esri
3.0157      enterpris    -2.6902          spice
2.9875        program    -2.6902             vk
2.9564           game    -2.6417          artsi
2.8443            app    -2.5100           guid
2.6902        rethink    -2.4900         awesom
2.6902         studio    -2.4778           page
2.6893            sdk    -2.3513        collect
--------------------
DOCS
 Positive Words          Negative Words
6.5591         awesom    -4.6121           data
6.3486        resourc    -4.3801           repo
5.9385          curat    -3.7294           blog
5.7068        datadog    -3.7251           open
4.1676        beautif    -2.9625          build
4.1676        typefac    -2.8604         docker
3.9894       document    -2.7479         materi
3.6202          about    -2.6902       metadata
3.4186           guid    -2.6902           rail
3.1750           list    -2.6902           girl
3.0300           book    -2.6902        pattern
2.9925           link    -2.6902        android
2.6517         design    -2.6422       homework
2.6417         hacker    -2.5793        languag
2.6417           news    -2.5038           http
2.3298          cheat    -2.3298recyclerview优秀文集
2.3298        octocat    -2.3298       hellodog
2.3298          sheet    -2.3298           我的博客
2.3298          chess    -2.3298         我的前端博客
2.3255           tool    -2.3298       颜海镜的个人博客
--------------------
EDU
 Positive Words          Negative Words
9.5762         materi    -4.4302       homework
3.4947            lua    -3.4791          solut
3.3248        univers    -2.6902          spark
2.6710         python    -2.5280       document
2.3298       pretoria    -2.4711           blog
2.3298         jqueri    -2.3298 siberiawolf的小窝
2.3298             up    -2.3298          chess
2.1767         lectur    -2.3298         我的前端博客
2.0838      introduct    -2.3298       hellodog
2.0838         novemb    -2.3298recyclerview优秀文集
1.9837          cours    -2.3298       颜海镜的个人博客
1.7612          adequ    -2.2257           list
1.7612             fp    -2.1160           page
1.7612           most    -2.0244             is
1.7612           chen    -2.0177            git
1.7612          cs35l    -1.9467           core
1.7612            liu    -1.9170          engin
1.7612            sec    -1.8656        resourc
1.7612             ta    -1.7690           data
1.7612           ucla    -1.7612        graphql
--------------------
HW
 Positive Words          Negative Words
6.4789       homework    -3.6417         materi
5.5822          solut    -2.7388         github
3.0820         winter    -2.3298 siberiawolf的小窝
2.6417         calcul    -2.3298           蓝色冰火
2.6417         cs193p    -2.3298       颜海镜的个人博客
2.0177            css    -2.3298           我的博客
2.0177           week    -2.3298      北邮人常用网址导航
1.5999          class    -2.3298recyclerview优秀文集
1.4160           html    -2.3298            www
1.3376           2015    -2.3298          chess
1.2769           java    -2.3298       hellodog
1.2495          cours    -2.2102           rail
1.1969           need    -2.2102          handl
1.1969             we    -2.2102          grade
1.0198            100    -2.1818           code
1.0198             64    -2.0838          readi
1.0198             be    -1.8858          simpl
1.0198            can    -1.8291            and
1.0198         should    -1.7787           data
1.0198          there    -1.7612           book
--------------------
WEB
 Positive Words          Negative Words
6.0069         websit    -5.0809         materi
4.1754      hackathon    -4.7557        datadog
4.1482           blog    -4.2567       homework
4.0353          spice    -4.1811        program
3.8811           repo    -3.9345         editor
3.8477         around    -3.8349             to
3.7180           page    -3.7925           game
3.3290           http    -3.5064      framework
3.2948          upcom    -3.4290          about
3.2804           site    -3.3996           data
2.8534         karaok    -3.0300         awesom
2.8534          night    -2.9761           list
2.8534          waffl    -2.9309            and
2.6902             vk    -2.8031         govern
2.6902          spark    -2.8024             in
2.6680         github    -2.7387           time
2.5517        project    -2.7065           flow
2.3298           蓝色冰火    -2.7065          build
2.3298         我的前端博客    -2.6902        rethink
2.3298 siberiawolf的小窝    -2.6902           edit
--------------------

In [ ]: