In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.cross_validation import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.grid_search import GridSearchCV
from IPython.display import Image
pd.set_option('chained_assignment', None)
plt.style.use('ggplot')
plt.rc('xtick.major', size=0)
plt.rc('ytick.major', size=0)

In [3]:
user_tags = pd.read_csv("user_tags_merge.csv")

In [4]:
user_tags


Out[4]:
user_id user_name gender_male airport animal apple auto autograph autumn baby ... tree vatican vegetable violin volleyball water waterfall wedding wine winter
0 963865524 e34_1023 1 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
1 231764687 kojita_na 1 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
2 35262468 kanacom02 0 0.000000 0.000000 0.000000 0.000000 0.000000 0.689974 0.000000 ... 0.622459 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
3 12889622 sato_charlotte 0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.890903 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
4 1338792479 nkmrerk 0 0.000000 0.000000 0.000000 0.000000 0.000000 0.975755 0.000000 ... 0.475021 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
5 632296001 yukamoumoon 0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.549834 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
6 204010752 keeeei_t 1 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.890903
7 1441642315 yukako0924 0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
8 472108366 keikinoshi 1 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
9 1457971013 mymt_yk 0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.425557 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.908877 0.000000 0.000000
10 341181859 hino6x9 0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
11 1433525330 nm.kyoki 1 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.425557
12 1614214864 14aryib_5 0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
13 443212411 710rumi 0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
14 20674860 accorone 0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.5 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
15 14068459 anarchistraw 0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.802184 0.000000
16 3710300 ant_62 1 0.000000 0.000000 0.000000 0.000000 0.689974 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
17 1391844497 ayaka.88 1 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
18 1618281134 ayakaaaaa1002 0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
19 1631841910 bubupig16 0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.524979 0.000000 0.000000
20 1591441193 emikokatsumata 0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.377541 0.000000 0.000000 0.000000 0.000000
21 1345412468 er1i18 0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 1.798109 0.000000 0.000000
22 249005215 hodaka_t 1 0.689974 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
23 1542515338 k___s___m 0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
24 307131495 kana_co_kana 0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25 1356904794 kanet1027 1 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
26 242340652 kishiasa 0 0.000000 0.869892 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
27 331584585 kuruton4423 0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
28 1449505380 maiko_totoro 0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
29 28551727 mizuking1 0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
30 1717694344 mm_knk823 0 0.000000 0.000000 0.000000 0.817574 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
31 1628779627 murakamigram 1 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.731059 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
32 1552060948 nene_uehara 0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
33 1685189231 nyu3uki 0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
34 992448809 ochan1227 1 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
35 1650824134 okuchan551204 1 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.785835 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
36 1491616121 polocco19 0 0.000000 0.000000 0.598688 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.645656 0.000000 0.000000 0.000000 0.000000 0.000000
37 21034569 ricktocaster 1 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
38 1588713355 saaya24v_ 0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
39 181680157 sao_tason 0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
40 31106041 sbytmk 1 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
41 1575747845 segawa_ayaka 0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
42 197588060 sejusonia_new 1 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
43 1528196058 shimpeterrr 1 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
44 802303590 shioringo_k 0 0.000000 0.000000 0.000000 0.817574 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
45 1108233901 shokosmo 0 0.000000 0.987872 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
46 572019031 slrymn 0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.998499 0.000000 0.000000
47 1507811323 syo_7tri 1 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
48 12444261 taikin1015 1 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 1.475873 0.000000 0.000000 0.000000 0.000000 0.000000
49 240997870 takagishingo 1 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.645656 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50 610808878 takara0626 0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
51 1500745758 takurafu 1 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.268941 0.000000 0.000000 0.000000
52 1476912852 tomshir13 1 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
53 51128075 uyeda612 1 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
54 709439195 whistle9 1 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
55 1302120936 ymuta 1 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
56 568710865 yohhatu 1 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
57 1315136665 yugaharada 1 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
58 1743972349 yuikotaniguchi 0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
59 1548402109 zashikane 0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.924142 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

60 rows × 144 columns


In [6]:
X = user_tags[['nail', 'person', 'sport', 'food','hair', 'wedding']]
X['mix'] = X['hair'] + X['nail'] + X['wedding']
X.tail()


Out[6]:
nail person sport food hair wedding mix
55 0 5.406591 0 5.783906 0 0 0
56 0 0.999590 0 8.565152 0 0 0
57 0 2.848325 0 5.342605 0 0 0
58 0 8.415150 0 1.440234 0 0 0
59 0 0.000000 0 0.000000 0 0 0

In [7]:
y = user_tags['gender_male']

In [8]:
X=X.drop(['nail', 'sport','hair', 'wedding','mix'], axis=1)

In [9]:
X.tail()


Out[9]:
person food
55 5.406591 5.783906
56 0.999590 8.565152
57 2.848325 5.342605
58 8.415150 1.440234
59 0.000000 0.000000

In [10]:
np.random.seed = 0

xmin, xmax = -2, 12
ymin, ymax = -2, 17

index_male = y[y==1].index
index_female = y[y==0].index

fig, ax = plt.subplots()
cm = plt.cm.RdBu
cm_bright = ListedColormap(['#FF0000', '#0000FF'])
sc = ax.scatter(X.loc[index_male, 'food'],
                X.loc[index_male, 'person']+(np.random.rand(len(index_male))-0.5)*0.1,
                color='b', label='male', alpha=0.3)
sc = ax.scatter(X.loc[index_female, 'food'],
                X.loc[index_female, 'person']+(np.random.rand(len(index_female))-0.5)*0.1,
                color='r', label='female', alpha=0.3)
ax.set_xlabel('food')
ax.set_ylabel('person')
ax.set_xlim(xmin, xmax)
ax.set_ylim(ymin, ymax)
ax.legend(bbox_to_anchor=(1.4, 1.03))
plt.show()



In [11]:
X = user_tags[['nail', 'person', 'sport', 'food','coffee','cake','beer','sky']]
y = user_tags["gender_male"]

In [12]:
clf = LogisticRegression()
def cross_val(clf, X, y, K, random_state=0):
    cv = KFold(len(y), K, shuffle=True, random_state=random_state)
    scores = cross_val_score(clf, X, y, cv=cv)
    return scores
scores = cross_val(clf, X, y, 6)
print('Scores:', scores)
print('Mean Score: {0:.3f} (+/-{1:.3f})'.format(scores.mean(), scores.std()*2))


Scores: [ 0.3  0.6  0.6  0.2  0.7  0.9]
Mean Score: 0.550 (+/-0.473)

In [13]:
clf = DecisionTreeClassifier(criterion='entropy', max_depth=2, min_samples_leaf=2)
scores = cross_val(clf, X, y, 5)
print('Scores:', scores)
print('Mean Score: {0:.3f} (+/-{1:.3f})'.format(scores.mean(), scores.std()*2))


Scores: [ 0.33333333  0.5         0.58333333  0.25        0.66666667]
Mean Score: 0.467 (+/-0.309)

In [153]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

In [271]:
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

In [15]:
clf = SVC(kernel='rbf', C=100)
X = user_tags[['nail', 'person', 'sport', 'food','coffee','wedding','cake','beer']]
y = user_tags["gender_male"]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.5, random_state=1)
clf.fit(X_train, y_train)


Out[15]:
SVC(C=100, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [16]:
clf.predict(X_val)


Out[16]:
array([1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 1])

In [17]:
y_val
clf.score(X_val, y_val)


Out[17]:
0.53333333333333333

In [18]:
XX = user_tags
XX.tail()


Out[18]:
user_id user_name gender_male airport animal apple auto autograph autumn baby ... tree vatican vegetable violin volleyball water waterfall wedding wine winter
55 1302120936 ymuta 1 0 0 0 0 0 0 0 ... 0 0.000000 0 0 0 0 0 0 0 0
56 568710865 yohhatu 1 0 0 0 0 0 0 0 ... 0 0.000000 0 0 0 0 0 0 0 0
57 1315136665 yugaharada 1 0 0 0 0 0 0 0 ... 0 0.000000 0 0 0 0 0 0 0 0
58 1743972349 yuikotaniguchi 0 0 0 0 0 0 0 0 ... 0 0.000000 0 0 0 0 0 0 0 0
59 1548402109 zashikane 0 0 0 0 0 0 0 0 ... 0 0.924142 0 0 0 0 0 0 0 0

5 rows × 144 columns


In [19]:
XX=XX.drop(['user_id', 'user_name','gender_male'], axis=1)

In [20]:
X


Out[20]:
nail person sport food coffee wedding cake beer
0 0.000000 1.221147 0.354344 3.023121 0.000000 0.000000 0.000000 0.574443
1 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
2 0.000000 0.999325 0.000000 5.917359 0.000000 0.000000 0.987872 0.622459
3 0.000000 3.479643 0.954271 3.609580 0.000000 0.000000 0.000000 0.354344
4 0.000000 0.994514 0.710949 3.017611 0.000000 0.000000 0.000000 0.000000
5 0.000000 5.610765 0.845535 1.147438 0.000000 0.000000 0.000000 0.000000
6 0.000000 0.000000 0.000000 2.969436 0.000000 0.000000 0.000000 0.000000
7 0.000000 12.782320 1.485762 0.964429 0.000000 0.000000 0.000000 0.000000
8 0.000000 0.000000 0.401312 0.000000 0.000000 0.000000 0.000000 0.000000
9 0.983697 5.949989 0.000000 5.100712 0.000000 0.908877 0.000000 0.000000
10 0.998641 9.586854 0.000000 0.500000 0.000000 0.000000 0.000000 0.000000
11 0.000000 4.975507 0.937027 0.524979 0.000000 0.000000 0.000000 0.000000
12 0.993940 7.850416 0.858149 1.450083 0.000000 0.000000 0.000000 1.349848
13 0.000000 0.000000 0.000000 0.731059 0.000000 0.000000 0.000000 0.000000
14 0.000000 0.500000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
15 0.000000 6.730825 0.000000 0.500000 0.000000 0.000000 0.000000 0.000000
16 0.000000 6.847312 0.000000 0.900249 0.000000 0.000000 0.000000 0.000000
17 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
18 0.000000 5.983299 0.000000 0.890903 0.000000 0.000000 0.000000 0.000000
19 0.000000 7.524030 0.401312 4.051224 0.000000 0.524979 0.000000 0.000000
20 0.000000 0.999629 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
21 0.942676 13.333723 0.000000 1.589144 0.668188 1.798109 0.890903 0.000000
22 0.000000 15.090081 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
23 0.000000 9.730203 0.598688 0.973403 0.000000 0.000000 0.000000 0.000000
24 0.000000 3.894333 2.205813 3.523393 0.000000 0.000000 0.000000 0.000000
25 0.000000 0.354344 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
26 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
27 0.000000 0.000000 0.000000 2.387596 0.000000 0.000000 0.000000 1.376627
28 0.000000 4.701673 0.000000 10.421192 0.000000 0.000000 0.845535 0.000000
29 0.000000 8.992919 0.000000 3.968250 0.000000 0.000000 0.000000 0.000000
30 0.000000 11.040161 0.000000 6.431471 0.000000 0.000000 0.000000 0.000000
31 0.000000 1.567051 0.668188 5.477984 0.000000 0.000000 0.000000 0.964429
32 0.000000 4.487068 1.684352 0.000000 0.000000 0.000000 0.000000 0.000000
33 0.000000 7.838181 0.000000 0.710949 0.000000 0.000000 0.000000 0.000000
34 0.000000 0.000000 0.000000 7.610730 1.848502 0.000000 0.000000 0.000000
35 0.000000 8.680740 1.440737 3.002256 0.000000 0.000000 0.000000 0.000000
36 0.000000 9.772715 0.710949 1.458711 0.000000 0.000000 0.000000 0.000000
37 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
38 0.000000 0.995033 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
39 0.000000 8.957856 0.000000 2.842150 0.000000 0.000000 0.000000 0.000000
40 0.000000 3.959355 0.450166 0.000000 0.000000 0.000000 0.000000 0.000000
41 0.000000 4.892234 0.000000 0.574443 0.000000 0.000000 0.000000 0.000000
42 0.000000 6.940801 0.000000 2.548225 0.000000 0.000000 0.000000 1.243546
43 0.000000 8.980636 0.000000 1.582683 0.475021 0.000000 0.000000 0.622459
44 0.000000 2.868846 0.000000 5.031718 0.000000 0.000000 0.000000 0.598688
45 0.000000 0.000000 0.000000 3.557240 0.000000 0.000000 0.000000 1.456837
46 0.000000 4.994834 0.000000 1.524979 0.000000 0.998499 0.000000 0.000000
47 0.000000 0.000000 0.750260 2.789011 0.000000 0.000000 0.000000 0.000000
48 0.000000 3.187645 2.600877 0.401312 0.000000 0.000000 0.000000 0.000000
49 0.000000 0.832018 1.317574 1.181225 0.377541 0.000000 0.000000 0.000000
50 0.000000 0.996316 0.401312 2.388283 0.000000 0.000000 0.000000 0.000000
51 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
52 0.000000 0.289050 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
53 0.000000 1.271204 0.768525 5.335201 0.450166 0.000000 0.000000 0.000000
54 0.000000 0.310026 0.000000 2.949051 1.424444 0.000000 0.000000 0.598688
55 0.000000 5.406591 0.000000 5.783906 0.000000 0.000000 0.000000 0.000000
56 0.000000 0.999590 0.000000 8.565152 0.000000 0.000000 0.000000 0.750260
57 0.000000 2.848325 0.000000 5.342605 0.000000 0.000000 0.000000 0.000000
58 0.000000 8.415150 0.000000 1.440234 0.000000 0.000000 0.817574 0.000000
59 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

In [21]:
XX_train, XX_val, y_train, y_val = train_test_split(XX, y, train_size=0.5, random_state=1)

In [22]:
clf = LogisticRegression()
def cross_val(clf, XX, y, K, random_state=0):
    cv = KFold(len(y), K, shuffle=True, random_state=random_state)
    scores = cross_val_score(clf, XX, y, cv=cv)
    return scores
scores = cross_val(clf, XX, y, 3)
print('Scores:', scores)
print('Mean Score: {0:.3f} (+/-{1:.3f})'.format(scores.mean(), scores.std()*2))


Scores: [ 0.55  0.7   0.8 ]
Mean Score: 0.683 (+/-0.205)

In [5]:
X = user_tags[['nail','hair', 'person', 'sport', 'food','night','coffee','wedding','cake','beer', 'dog', 'animal', 'tree','blossom','cat', 'flower','sky','nature','cherry']]
y = user_tags["gender_male"]

In [6]:
X['animal']=X['animal']+X['dog']+X['cat']
X['cosme']=X['hair']+X['nail']
X['nature']=X['nature']+X['sky']+X['flower']+X['tree']+X['blossom']+X['cherry']
X = X.drop(['nail','hair', 'dog', 'cat', 'sky','flower','tree','blossom','cherry'],axis=1)

In [7]:
X.tail()


Out[7]:
person sport food night coffee wedding cake beer animal nature cosme
55 5.406591 0 5.783906 0 0 0 0.000000 0.00000 0 0.000000 0
56 0.999590 0 8.565152 0 0 0 0.000000 0.75026 0 0.802184 0
57 2.848325 0 5.342605 0 0 0 0.000000 0.00000 0 0.000000 0
58 8.415150 0 1.440234 0 0 0 0.817574 0.00000 0 0.000000 0
59 0.000000 0 0.000000 0 0 0 0.000000 0.00000 0 2.057006 0

In [8]:
clf = LogisticRegression()
def cross_val(clf, X, y, K, random_state=0):
    cv = KFold(len(y), K, shuffle=True, random_state=random_state)
    scores = cross_val_score(clf, X, y, cv=cv)
    return scores
for i in range(2,12):
    scores = cross_val(clf, X, y, i)
    print(i)
    print('Scores:', scores)
    print('Mean Score: {0:.3f} (+/-{1:.3f})'.format(scores.mean(), scores.std()*2))


2
Scores: [ 0.6  0.5]
Mean Score: 0.550 (+/-0.100)
3
Scores: [ 0.6   0.65  0.75]
Mean Score: 0.667 (+/-0.125)
4
Scores: [ 0.66666667  0.6         0.33333333  0.8       ]
Mean Score: 0.600 (+/-0.340)
5
Scores: [ 0.58333333  0.58333333  0.58333333  0.41666667  0.83333333]
Mean Score: 0.600 (+/-0.267)
6
Scores: [ 0.5  0.5  0.6  0.2  0.8  0.8]
Mean Score: 0.567 (+/-0.411)
7
Scores: [ 0.55555556  0.66666667  0.33333333  0.66666667  0.875       0.75        0.75      ]
Mean Score: 0.657 (+/-0.321)
8
Scores: [ 0.625       0.625       0.5         0.625       0.57142857  0.85714286
  0.71428571  0.71428571]
Mean Score: 0.654 (+/-0.202)
9
Scores: [ 0.57142857  0.71428571  0.42857143  0.57142857  0.57142857  0.71428571
  0.66666667  1.          0.66666667]
Mean Score: 0.656 (+/-0.297)
10
Scores: [ 0.5         0.66666667  0.83333333  0.33333333  0.66666667  0.33333333
  0.83333333  0.66666667  1.          0.66666667]
Mean Score: 0.650 (+/-0.407)
11
Scores: [ 0.5         0.66666667  0.83333333  0.33333333  0.66666667  0.4         0.6
  1.          0.6         0.8         0.8       ]
Mean Score: 0.655 (+/-0.377)

In [29]:
clf = SVC(kernel='rbf', C=1000)
for i in range(2,12):
    scores = cross_val(clf, X, y, i)
    print(i)
    print('Scores:', scores)
    print('Mean Score: {0:.3f} (+/-{1:.3f})'.format(scores.mean(), scores.std()*2))


2
Scores: [ 0.46666667  0.46666667]
Mean Score: 0.467 (+/-0.000)
3
Scores: [ 0.45  0.5   0.55]
Mean Score: 0.500 (+/-0.082)
4
Scores: [ 0.4         0.4         0.26666667  0.53333333]
Mean Score: 0.400 (+/-0.189)
5
Scores: [ 0.25        0.58333333  0.41666667  0.41666667  0.75      ]
Mean Score: 0.483 (+/-0.340)
6
Scores: [ 0.2  0.6  0.5  0.5  0.7  0.7]
Mean Score: 0.533 (+/-0.340)
7
Scores: [ 0.22222222  0.55555556  0.44444444  0.55555556  0.625       0.5         0.625     ]
Mean Score: 0.504 (+/-0.259)
8
Scores: [ 0.25        0.5         0.5         0.5         0.57142857  0.71428571
  0.57142857  0.57142857]
Mean Score: 0.522 (+/-0.245)
9
Scores: [ 0.28571429  0.42857143  0.42857143  0.28571429  0.57142857  0.42857143
  0.5         0.83333333  0.5       ]
Mean Score: 0.474 (+/-0.311)
10
Scores: [ 0.16666667  0.33333333  0.66666667  0.5         0.33333333  0.5         0.5
  0.5         0.83333333  0.5       ]
Mean Score: 0.483 (+/-0.348)
11
Scores: [ 0.16666667  0.33333333  0.66666667  0.5         0.33333333  0.6         0.4
  0.8         0.6         0.6         0.6       ]
Mean Score: 0.509 (+/-0.348)

In [30]:
clf = DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_leaf=2)
for i in range(2,12):
    scores = cross_val(clf, X, y, i)
    print(i)
    print('Scores:', scores)
    print('Mean Score: {0:.3f} (+/-{1:.3f})'.format(scores.mean(), scores.std()*2))


2
Scores: [ 0.6  0.4]
Mean Score: 0.500 (+/-0.200)
3
Scores: [ 0.5   0.45  0.4 ]
Mean Score: 0.450 (+/-0.082)
4
Scores: [ 0.46666667  0.46666667  0.33333333  0.53333333]
Mean Score: 0.450 (+/-0.145)
5
Scores: [ 0.33333333  0.41666667  0.58333333  0.41666667  0.58333333]
Mean Score: 0.467 (+/-0.200)
6
Scores: [ 0.4  0.6  0.4  0.4  0.7  0.6]
Mean Score: 0.517 (+/-0.243)
7
Scores: [ 0.44444444  0.55555556  0.44444444  0.55555556  0.25        0.75        0.625     ]
Mean Score: 0.518 (+/-0.293)
8
Scores: [ 0.5         0.625       0.25        0.625       0.85714286  0.42857143
  0.85714286  0.57142857]
Mean Score: 0.589 (+/-0.384)
9
Scores: [ 0.42857143  0.71428571  0.28571429  0.57142857  0.71428571  0.28571429
  0.83333333  0.83333333  0.66666667]
Mean Score: 0.593 (+/-0.404)
10
Scores: [ 0.33333333  0.66666667  0.83333333  0.33333333  0.83333333  0.5         0.5
  0.83333333  0.5         0.66666667]
Mean Score: 0.600 (+/-0.371)
11
Scores: [ 0.33333333  0.66666667  0.83333333  0.33333333  0.83333333  0.8         0.6
  0.4         0.8         0.2         0.8       ]
Mean Score: 0.600 (+/-0.457)

In [11]:
from sklearn.externals import joblib

In [9]:
clf = LogisticRegression()
clf.fit(X,y)


Out[9]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)

In [12]:
joblib.dump(clf, 'clf.pkl')


Out[12]:
['clf.pkl', 'clf.pkl_01.npy', 'clf.pkl_02.npy', 'clf.pkl_03.npy']

In [35]:
X.tail()


Out[35]:
person sport food night coffee wedding cake beer animal nature cosme
55 5.406591 0 5.783906 0 0 0 0.000000 0.00000 0 0.000000 0
56 0.999590 0 8.565152 0 0 0 0.000000 0.75026 0 0.802184 0
57 2.848325 0 5.342605 0 0 0 0.000000 0.00000 0 0.000000 0
58 8.415150 0 1.440234 0 0 0 0.817574 0.00000 0 0.000000 0
59 0.000000 0 0.000000 0 0 0 0.000000 0.00000 0 2.057006 0

In [ ]: