In [106]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.


/kaggle/input/train.csv
/kaggle/input/test.csv

In [107]:
# Load the training and test splits from the Kaggle input directory
df_train = pd.read_csv('/kaggle/input/train.csv')
df_test = pd.read_csv('/kaggle/input/test.csv')
df_train.shape


Out[107]:
(48192, 3)
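A quick look at how the rating target is distributed can help interpret the macro-averaged F1 score used later; a small sketch, not executed in this run:

# Share of each rating class in the training set
print(df_train['target'].value_counts(normalize=True).sort_index())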

In [108]:
df_test.head(2)


Out[108]:
review title target
0 I am from old town, and I stayed in this hotel... Incredible Hotel 5
1 We have been coming to the Ocean Park Inn for ... We Love this beach front Inn 5

In [109]:
# Optional: subsample the training set for faster experimentation (left disabled)
#df_train = df_train.sample(10000, random_state=0)

In [110]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Bag-of-words counts over the review titles; the term-frequency scaling step is left disabled
tf_transformer = TfidfTransformer(use_idf=False)
count_vect = CountVectorizer()

X_train_counts = count_vect.fit_transform(df_train['title'])
#X_train_counts = tf_transformer.fit_transform(X_train_counts)
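If the term-frequency scaling were re-enabled, the transformer fitted on the training counts would also need to be applied to the test counts; a minimal sketch reusing the objects above, not executed here:

# TF scaling applied consistently to train and test (optional)
X_train_tf = tf_transformer.fit_transform(X_train_counts)
X_test_tf = tf_transformer.transform(count_vect.transform(df_test['title']))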

In [111]:
from sklearn.ensemble import ExtraTreesClassifier

# Fit an Extra-Trees baseline; .toarray() materialises the dense document-term matrix
classifier = ExtraTreesClassifier(random_state=123)
classifier.fit(X_train_counts.toarray(), df_train['target'])


Out[111]:
ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100,
                     n_jobs=None, oob_score=False, random_state=123, verbose=0,
                     warm_start=False)
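Scikit-learn's tree ensembles also accept sparse input, so the dense conversion above can be skipped to save memory; a sketch, not run here:

# Fit directly on the sparse count matrix instead of a dense array
classifier_sparse = ExtraTreesClassifier(random_state=123)
classifier_sparse.fit(X_train_counts, df_train['target'])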

In [112]:
from sklearn.metrics import f1_score

# Macro-averaged F1 on the held-out test titles
f1_score(df_test['target'], classifier.predict(count_vect.transform(df_test['title'])), average='macro')


Out[112]:
0.4261187748670672
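The same baseline can also be packaged as a single scikit-learn Pipeline, so the vectoriser and classifier are always fitted and applied together; a sketch under the same column names, not executed here:

from sklearn.pipeline import make_pipeline

# Vectorise titles and fit the classifier in one object (reuses the imports above)
pipeline = make_pipeline(CountVectorizer(), ExtraTreesClassifier(random_state=123))
pipeline.fit(df_train['title'], df_train['target'])
print(f1_score(df_test['target'], pipeline.predict(df_test['title']), average='macro'))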