In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# NOTE(review): read_table parses as tab-separated despite the .csv extension —
# the file is presumably tab-delimited (the head() below parses cleanly); confirm
# against the raw file before switching to read_csv.
df = pd.read_table("tagged.csv")

In [3]:
# Peek at the first rows to sanity-check the parse and column contents.
df.head()


Out[3]:
n_comments year title speaker_info section target_audience type prerequisites description speaker_link_present content_url_present deadlinediff selected
0 0 2015 consuming government data python d pratap vardhan data scientist gramenercom data... data visualization analytics intermediate talks NaN the explosion open data especially government ... False False -94 True
1 1 2015 dont get scared get started tapasweni pathaki done bachelors it igdtuw i w... others beginner talks nothingother passion coding opensource world full excitement knowledge enc... False False 1 False
2 0 2015 distributed scheduling leveraging multiple nod... i software engineer red hat inc working gluste... concurrency beginner talks a basic understanding distributed system works... setting cron job machine perhaps easiest way s... True True 35 False
3 0 2015 analyzing python code pylint im open source enthusiast coming romania lead ... others intermediate talks the participants basic understanding python no... given dynamic nature python bugs tend creep co... False False -112 False
4 0 2015 python metaprogramming macros madness more suhas data scientist gramener previously engin... core python intermediate talks NaN summaryever wanted conquer world fell short kn... False False -94 True

In [4]:
# List all columns — these names drive the feature-group selections below.
df.columns


Out[4]:
Index([u'n_comments', u'year', u'title', u'speaker_info', u'section',
       u'target_audience', u'type', u'prerequisites', u'description',
       u'speaker_link_present', u'content_url_present', u'deadlinediff',
       u'selected'],
      dtype='object')

In [5]:
# One-hot encode each categorical column, keeping the raw numpy arrays.
categorical_cols = "section target_audience type".split()
categoricals = [pd.get_dummies(df[c]).values for c in categorical_cols]

In [6]:
# Sanity check: one dummy matrix per categorical column, all with 290 rows.
[x.shape for x in categoricals]


Out[6]:
[(290, 13), (290, 3), (290, 2)]

In [7]:
import numpy as np

In [8]:
# Stack the one-hot blocks side by side into a single categorical matrix.
xCat = np.concatenate(categoricals, axis=1)

In [9]:
# 13 + 3 + 2 dummy columns combined -> (290, 18).
xCat.shape


Out[9]:
(290, 18)

In [10]:
# Boolean indicator columns, cast to 0/1 integers.
bool_cols = ["speaker_link_present", "content_url_present"]
xBool = df[bool_cols].astype(int)

In [11]:
# Free-text columns to be concatenated row-wise; fill NaNs with a single
# space so the later " ".join never receives a missing value.
# Fixed: replaced copy() + fillna(inplace=True) with a chained fillna —
# same result, idempotent on re-run, no in-place mutation.
xText = df['title speaker_info prerequisites description'.split()].fillna(" ")

In [12]:
# Collapse the four text columns into one space-joined string per row.
xText = xText.apply(" ".join, axis=1)

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
# Vectorize the combined text with default tf-idf settings (sparse output).
vectorizer = TfidfVectorizer()
xText = vectorizer.fit_transform(xText.values)

In [15]:
# Plain numeric features: comment count and days relative to the deadline.
xNum = df[["n_comments", "deadlinediff"]].values

In [16]:
# All feature groups as dense arrays; .toarray() densifies the sparse tf-idf
# matrix — fine at 290 rows, but memory-hungry for larger corpora.
feats = [xCat, xBool.values, xText.toarray(), xNum]

In [17]:
# Full feature matrix and binary target (selected True/False -> 1/0).
X = np.hstack(feats)
y = df['selected'].astype(int).values

In [18]:
# Persist the training data in MATLAB format for use outside Python.
from scipy.io import savemat
savemat("trainData.mat", dict(X=X, y=y))

In [19]:
from sklearn.preprocessing import StandardScaler

In [20]:
# NOTE(review): xScaled is computed here but never used — the CV calls below
# run on the unscaled X. Confirm whether scaling was meant to feed them.
xScaled = StandardScaler().fit_transform(X)

In [21]:
from sklearn.linear_model import LogisticRegression
from jeeves.cross_validation import report_stratifiedKfold_cv

In [22]:
# Stratified k-fold CV (project helper from jeeves; per the output, 5 folds
# reporting accuracy and recall per fold) on the full feature matrix.
report_stratifiedKfold_cv(LogisticRegression(), X, y, shuffle=True)


Out[22]:
accuracy_score recall_score
0 0.830508 0.307692
1 0.827586 0.384615
2 0.913793 0.615385
3 0.810345 0.538462
4 0.842105 0.500000

In [23]:
from sklearn.svm import SVC

In [24]:
# Same CV with an RBF-default SVC on the full (unscaled) feature matrix.
report_stratifiedKfold_cv(SVC(), X, y, shuffle=True)


Out[24]:
accuracy_score recall_score
0 0.864407 0.615385
1 0.844828 0.615385
2 0.879310 0.538462
3 0.913793 0.692308
4 0.754386 0.166667

In [25]:
# Feature groups excluding the tf-idf text block, to test its contribution.
x_non_text = [xCat, xBool.values, xNum]

In [26]:
# NOTE(review): rebinding x_non_text from list to array — the cell is not
# idempotent on re-run (hstack of a 2-D array would fail the second time).
x_non_text = np.hstack(x_non_text)

In [27]:
# Standardize the non-text features (zero mean, unit variance per column).
x_non_text_scaled = StandardScaler().fit_transform(x_non_text)

In [28]:
# 2-D PCA projection for visualizing class separation of non-text features.
from sklearn.decomposition import PCA
pca = PCA(n_components=2)

In [29]:
# Project the scaled non-text features onto the first two principal
# components and color points by the target class.
# Fixed: added title/axis labels so the figure stands alone, and a trailing
# semicolon to suppress the PathCollection repr in the output.
x_red = pca.fit_transform(x_non_text_scaled)
plt.figure(figsize=(10, 8))
plt.scatter(x_red[:, 0], x_red[:, 1], c=y)
plt.title("PCA projection of non-text features (colored by selection)")
plt.xlabel("PC 1")
plt.ylabel("PC 2");


Out[29]:
<matplotlib.collections.PathCollection at 0x114030a50>

In [30]:
# Logistic regression on scaled non-text features only — comparable accuracy
# to the full matrix (In [22]), suggesting text adds little for this model.
report_stratifiedKfold_cv(LogisticRegression(), x_non_text_scaled, y, shuffle=True)


Out[30]:
accuracy_score recall_score
0 0.847458 0.615385
1 0.827586 0.384615
2 0.844828 0.692308
3 0.810345 0.307692
4 0.859649 0.500000

In [31]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [32]:
# Random forest on the unscaled non-text features (trees don't need scaling),
# now with 10 folds.
report_stratifiedKfold_cv(RandomForestClassifier(), x_non_text, y, shuffle=True, n_folds=10)


Out[32]:
accuracy_score recall_score
0 0.866667 0.571429
1 0.766667 0.000000
2 0.866667 0.428571
3 0.766667 0.571429
4 0.896552 0.500000
5 0.896552 0.833333
6 0.785714 0.500000
7 0.892857 0.833333
8 0.964286 0.833333
9 0.821429 0.333333

In [33]:
# Single decision tree, same setup — surprisingly strong per-fold recall.
report_stratifiedKfold_cv(DecisionTreeClassifier(), x_non_text, y, shuffle=True, n_folds=10)


Out[33]:
accuracy_score recall_score
0 0.833333 0.428571
1 0.900000 0.714286
2 0.966667 1.000000
3 0.966667 1.000000
4 0.965517 0.833333
5 0.896552 0.833333
6 0.821429 0.666667
7 0.928571 0.666667
8 0.928571 0.833333
9 1.000000 1.000000

In [34]:
# Compare random-forest mean CV scores with vs. without the tf-idf block.
# Fixed: parenthesized the single-argument print so the cell is valid in
# both Python 2 and 3 (output is identical under Python 2).
# NOTE(review): shuffle=True with no fixed random seed makes these means
# non-reproducible across runs.
print(report_stratifiedKfold_cv(RandomForestClassifier(), x_non_text, y, shuffle=True, n_folds=10).mean())
print(report_stratifiedKfold_cv(RandomForestClassifier(), X, y, shuffle=True, n_folds=10).mean())


accuracy_score    0.861938
recall_score      0.578571
dtype: float64
accuracy_score    0.786486
recall_score      0.076190
dtype: float64

In [35]:
# Same comparison for a single decision tree: non-text features again win.
# Fixed: parenthesized the single-argument print so the cell is valid in
# both Python 2 and 3 (output is identical under Python 2).
# NOTE(review): no fixed random seed — results vary across runs.
print(report_stratifiedKfold_cv(DecisionTreeClassifier(), x_non_text, y, shuffle=True, n_folds=10).mean())
print(report_stratifiedKfold_cv(DecisionTreeClassifier(), X, y, shuffle=True, n_folds=10).mean())


accuracy_score    0.927635
recall_score      0.828571
dtype: float64
accuracy_score    0.895747
recall_score      0.745238
dtype: float64

In [36]:
# Grid-search over decision-tree hyperparameters.
# NOTE(review): sklearn.grid_search is the pre-0.18 module; newer sklearn
# moved GridSearchCV to sklearn.model_selection — update when upgrading.
from sklearn.grid_search import GridSearchCV
param_grid = dict(
    criterion="gini entropy".split(), splitter="best random".split(),
    max_features=[None, "auto", "sqrt", "log2"],
    class_weight=["balanced", None],
    presort=[True, False]
)
clf = DecisionTreeClassifier()
# NOTE(review): recall_score is imported but never used — scoring below is
# "accuracy"; either drop this import or switch scoring to recall, which
# earlier cells suggest is the metric of interest.
from sklearn.metrics import recall_score
gcv = GridSearchCV(clf, param_grid=param_grid, n_jobs=-1, scoring="accuracy")

In [37]:
# Exhaustive search: 2*2*4*2*2 = 64 parameter combinations, CV'd in parallel.
gcv.fit(x_non_text, y)


Out[37]:
GridSearchCV(cv=None, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'presort': [True, False], 'max_features': [None, 'auto', 'sqrt', 'log2'], 'class_weight': ['balanced', None], 'criterion': ['gini', 'entropy'], 'splitter': ['best', 'random']},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

In [39]:
# Best hyperparameter combination found by the grid search.
gcv.best_params_


Out[39]:
{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_features': None,
 'presort': False,
 'splitter': 'best'}

In [40]:
# The winning tree, already refit on the full data (refit=True above).
clf = gcv.best_estimator_

In [42]:
# 10-fold CV of the tuned tree (no shuffling this time, unlike earlier runs).
report_stratifiedKfold_cv(clf, x_non_text, y, n_folds=10)


Out[42]:
accuracy_score recall_score
0 0.933333 1.000000
1 0.900000 0.571429
2 0.933333 0.857143
3 0.900000 0.857143
4 0.896552 0.666667
5 0.965517 0.833333
6 0.928571 0.666667
7 0.892857 0.833333
8 0.928571 0.833333
9 0.928571 0.833333

In [43]:
from sklearn.tree import export_graphviz

In [44]:
# Export the tuned tree in Graphviz .dot format for offline rendering.
with open("cfp_classifier.dot", "w") as f_out:
    export_graphviz(clf, out_file=f_out)

In [ ]:
# Render the fitted decision tree inline as a PNG.
# Fixed: the original cell was pasted from the sklearn iris example and
# referenced undefined names (StringIO, pydot, iris). This notebook's clf was
# fit on x_non_text, whose column names are not tracked, so feature_names is
# omitted; class names map 0 -> "not selected", 1 -> "selected".
from IPython.display import Image
import pydot
try:
    from StringIO import StringIO  # Python 2 (this notebook's kernel)
except ImportError:
    from io import StringIO        # Python 3
dot_data = StringIO()
export_graphviz(clf, out_file=dot_data,
                class_names=["not selected", "selected"],
                filled=True, rounded=True,
                special_characters=True)
# NOTE(review): pydot >= 1.2 returns a list here — take element [0] if so.
graph = pydot.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())