In [1]:
from IPython.display import Image
Image(filename='pipeline_flow_chart.png')
Out[1]:
[pipeline flow chart image]
In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.base import TransformerMixin
import pandas as pd
import numpy as np
import csv
import re
In [3]:
df = pd.read_csv('data/161207_ZikaLabels.csv')
df.dropna(axis=0,inplace=True)
X = df.diagnosisRAW
In [4]:
class AsciiTransformer(TransformerMixin):
    """Strip accents by decoding ISO-8859-2 text and re-encoding it as plain ASCII."""
    def transform(self, X, **transform_params):
        if not isinstance(X, pd.Series):
            X = pd.Series(X)
        return X.apply(lambda x: x.decode('ISO-8859-2').encode('ASCII', 'ignore'))
    def fit(self, X, y=None, **fit_params):
        return self
t1 = AsciiTransformer().fit_transform(X)
print t1[:10]
print len(t1)
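As a quick sanity check, the transformer can be applied to a hand-made accented string (a hypothetical example, not a row from the CSV):
sample = pd.Series(['dor de cabe\xe7a e febre, suspeita de zika'])
print AsciiTransformer().fit_transform(sample)[0]
# -> 'dor de cabea e febre, suspeita de zika' (the accented byte is simply dropped)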
In [5]:
class LowerCaseTransformer(TransformerMixin):
    def transform(self, X, **transform_params):
        return pd.Series(X.apply(lambda x: x.lower()))
    def fit(self, X, y=None, **fit_params):
        return self
t2 = LowerCaseTransformer().fit_transform(t1)
print t2[:10]
print len(t2)
In [6]:
class RemoveSymsTransformer(TransformerMixin):
    """Replace every character that is not a letter, digit, whitespace, or period with a space."""
    def transform(self, X, **transform_params):
        return pd.Series(X.apply(lambda x: re.sub(r'[^A-Za-z0-9\s\.]', ' ', x)))
    def fit(self, X, y=None, **fit_params):
        return self
t3 = RemoveSymsTransformer().fit_transform(t2)
print t3[:10]
print len(t3)
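What the symbol filter does to punctuation and hashtags, shown on a hypothetical string:
print re.sub(r'[^A-Za-z0-9\s\.]', ' ', 'febre, manchas vermelhas e #zika!')
# -> 'febre  manchas vermelhas e  zika '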
In [7]:
class RemoveStopWordsTransformer(TransformerMixin):
    """Drop Portuguese stop words listed in pt_stop_words.txt."""
    def transform(self, X, **transform_params):
        with open('pt_stop_words.txt', 'rb') as f:
            stop = []
            reader = csv.reader(f)
            for word in reader:
                word = word[0].split()[0].decode('ISO-8859-2').encode('ASCII', 'ignore')
                if word:
                    stop.append(word)
        return pd.Series(X.apply(lambda x: ' '.join([token for token in x.split() if token not in stop])))
    def fit(self, X, y=None, **fit_params):
        return self
t4 = RemoveStopWordsTransformer().fit_transform(t3)
# keep track of documents that end up empty after stop-word removal
empty_index = []
for i, text in enumerate(t4):
    if text == '':
        empty_index.append(i)
print t4[:10]
print len(t4)
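Reading the stop-word file on every transform call works but is wasteful; since the transformer already has a fit step, the list could be loaded there once and kept as a set for faster membership tests. A sketch of that variant (CachedStopWordsTransformer is a hypothetical name; behaviour is otherwise the same):
class CachedStopWordsTransformer(TransformerMixin):
    def fit(self, X, y=None, **fit_params):
        with open('pt_stop_words.txt', 'rb') as f:
            words = (row[0].split()[0].decode('ISO-8859-2').encode('ASCII', 'ignore')
                     for row in csv.reader(f) if row and row[0].split())
            self.stop_ = set(w for w in words if w)
        return self
    def transform(self, X, **transform_params):
        return pd.Series(X.apply(lambda x: ' '.join(t for t in x.split() if t not in self.stop_)))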
In [8]:
class ZikaCounterTransformer(TransformerMixin):
    """Count matches of the loose pattern z..a, which catches 'zika' as well as close variants such as 'zica'."""
    def transform(self, X, **transform_params):
        return pd.Series(X.apply(lambda x: len(re.findall(r'z.{2}a', x)))).values.reshape(-1, 1)
    def fit(self, X, y=None, **fit_params):
        return self
f1 = ZikaCounterTransformer().fit_transform(t4)
print f1[:10]
print len(f1)
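A quick look at what the counter picks up (hypothetical strings):
for s in ['zika virus', 'suspeita de zica', 'nenhum sintoma']:
    print s, '->', len(re.findall(r'z.{2}a', s))
# zika virus -> 1, suspeita de zica -> 1, nenhum sintoma -> 0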
In [9]:
class SentimentTransformer(TransformerMixin):
    """VADER polarity scores (positive, neutral, negative, compound) for each document."""
    def transform(self, X, **transform_params):
        matrix = []
        sa = SentimentIntensityAnalyzer()
        for text in X:
            senti = sa.polarity_scores(text)
            matrix.append([senti['pos'], senti['neu'], senti['neg'], senti['compound']])
        return pd.DataFrame(data=matrix, columns=['positive', 'neutral', 'negative', 'compound']).values
    def fit(self, X, y=None, **fit_params):
        return self
f2 = SentimentTransformer().fit_transform(t4)
print f2[:10]
print len(f2)
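One caveat: VADER's lexicon is English, so on Portuguese text many documents will come back almost entirely neutral, which limits how much signal these four columns can carry. A quick spot check (assumes nltk's vader_lexicon data is already downloaded, as it must be for the cell above):
sa = SentimentIntensityAnalyzer()
print sa.polarity_scores(t4.iloc[0])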
In [18]:
f3A = TfidfVectorizer().fit_transform(t4)
print f3A[:2]
print f3A.shape
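f3A is a sparse document-term matrix; its second dimension is the vocabulary size that the next cell compresses with truncated SVD. To inspect the vocabulary directly (a minimal sketch):
vec = TfidfVectorizer().fit(t4)
print len(vec.vocabulary_)          # number of distinct terms, i.e. f3A.shape[1]
print sorted(vec.vocabulary_)[:10]  # a few of the terms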
In [11]:
f3B = TruncatedSVD(100).fit_transform(f3A)
print f3B[:2]
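TfidfVectorizer followed by TruncatedSVD is the LSA feature used in the pipelines below. To judge whether 100 components retain enough of the TF-IDF matrix, the explained variance ratio can be checked (a sketch; there is no single right threshold):
svd = TruncatedSVD(n_components=100)
f3B = svd.fit_transform(f3A)
print svd.explained_variance_ratio_.sum()  # fraction of variance kept by the 100 components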
In [12]:
from sklearn.pipeline import make_pipeline,make_union
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import GaussianNB
In [13]:
df = pd.read_csv('data/161207_ZikaLabels.csv')
df.dropna(axis=0,inplace=True)
X = df.diagnosisRAW
y = df.zika
X_train, X_test, y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
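The split above is purely random. If the zika label is imbalanced (an assumption worth checking, not something shown in the notebook), a stratified split keeps the class proportions the same in train and test:
print df.zika.value_counts()  # how balanced is the label?
# a stratified alternative (not used below):
# train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)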
In [14]:
pipeline = make_pipeline(AsciiTransformer(),
                         LowerCaseTransformer(),
                         RemoveSymsTransformer(),
                         RemoveStopWordsTransformer(),
                         TfidfVectorizer(),  # LSA only
                         TruncatedSVD(500),
                         GaussianNB())
model = pipeline.fit(X_train,y_train)
print 'Prediction:', model.predict('eu tenho o virus zika')
print 'Score:', model.score(X_test,y_test)
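A single hold-out score per feature set is noisy; cross-validating the whole pipeline gives a steadier comparison (a sketch; 5 folds is an arbitrary choice, and it refits the full pipeline once per fold):
from sklearn.cross_validation import cross_val_score
scores = cross_val_score(pipeline, X, y, cv=5)
print scores.mean(), scores.std()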
In [15]:
pipeline = make_pipeline(AsciiTransformer(),
                         LowerCaseTransformer(),
                         RemoveSymsTransformer(),
                         RemoveStopWordsTransformer(),
                         ZikaCounterTransformer(),  # zika counter only
                         GaussianNB())
model = pipeline.fit(X_train,y_train)
print 'Prediction:', model.predict('eu tenho o virus zika')
print 'Score:', model.score(X_test,y_test)
In [16]:
pipeline = make_pipeline(AsciiTransformer(),
                         LowerCaseTransformer(),
                         RemoveSymsTransformer(),
                         RemoveStopWordsTransformer(),
                         SentimentTransformer(),  # sentiment only
                         GaussianNB())
model = pipeline.fit(X_train,y_train)
print 'Prediction:', model.predict('eu tenho o virus zika')
print 'Score:', model.score(X_test,y_test)
In [17]:
lsa = make_pipeline(TfidfVectorizer(),TruncatedSVD(500))
feature_union = make_union(lsa,ZikaCounterTransformer(), SentimentTransformer())
pipeline = make_pipeline(AsciiTransformer(),
                         LowerCaseTransformer(),
                         RemoveSymsTransformer(),
                         RemoveStopWordsTransformer(),
                         feature_union,
                         GaussianNB())
model = pipeline.fit(X_train,y_train)
print 'Prediction:', model.predict('eu tenho o virus zika')
print 'Score:', model.score(X_test,y_test)
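The union simply stacks the three feature blocks side by side: 500 LSA components, 1 zika count, and 4 sentiment scores. To confirm the combined width, the fitted pipeline's steps can be reused on the training documents (a sketch relying on make_pipeline's auto-generated step name 'featureunion'):
cleaned = X_train
for name, step in model.steps[:4]:   # run just the four text-cleaning steps
    cleaned = step.transform(cleaned)
print model.named_steps['featureunion'].transform(cleaned).shape  # expected (len(X_train), 505)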