In [6]:
import pandas as pd
import fastparquet
from gensim.models.wrappers import FastText
import csv
import matplotlib.pyplot as plt
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
import nltk
nltk.download('punkt')
nltk.download("stopwords")
from nltk.corpus import stopwords
import re
import string
In [2]:
pd.options.mode.chained_assignment = None
In [3]:
# Load the precomputed dataframe from a local parquet file.
# Columns used downstream: 'sentiment' and 'text_clean1'
# (presumably a label column and pre-cleaned text — confirm schema).
pfile = fastparquet.ParquetFile('5col_DFrame.parq')
df = pfile.to_pandas() # all columns
#df2 = pfile.to_pandas(columns=['floats', 'times']) # pick some columns
In [4]:
df.tail()
Out[4]:
In [7]:
# Write fastText-formatted data files: one example per line, formatted
# as "__label__<sentiment> <cleaned text>".
TRAIN_SIZE = 10000  # first TRAIN_SIZE examples -> train.txt, rest -> test.txt

dftrain = pd.DataFrame()
dftrain.loc[:, 'data'] = "__label__" + df.loc[:, 'sentiment'] + " " + df.loc[:, 'text_clean1']

# If either source column is NaN the concatenation is NaN, which the old
# to_csv-based export serialized as a blank line and fastText would choke
# on / mis-learn from; drop those rows instead.
labeled = dftrain['data'].dropna()

def _write_lines(path, lines):
    # Write raw lines with no quoting/escaping — fastText expects plain
    # text lines, not CSV. This replaces the fragile
    # to_csv(sep='\n', quoting=QUOTE_NONE, quotechar='') hack, which is
    # rejected by the csv layer in modern pandas.
    with open(path, 'w') as f:
        for line in lines:
            f.write(line + '\n')

_write_lines('train.txt', labeled.iloc[:TRAIN_SIZE])
_write_lines('test.txt', labeled.iloc[TRAIN_SIZE:])
In [8]:
###FAST TEXT###
# Train a supervised fastText classifier on train.txt; the second
# argument is the output prefix (presumably writes model.bin — confirm
# against the installed fasttext version's docs).
# NOTE(review): `fasttext.supervised` is the legacy pre-0.9 Python API;
# the current package uses `fasttext.train_supervised` and returns a
# different result object. The evaluation cell below depends on the
# legacy API (`result.precision` / `.recall` / `.nexamples`).
import fasttext
classifier = fasttext.supervised('train.txt', 'model')
In [ ]:
# Evaluate the trained classifier on the held-out file and report
# precision@1, recall@1, and the number of test examples.
result = classifier.test('test.txt')
for label, value in (('P@1', result.precision),
                     ('R@1', result.recall),
                     ('Number of examples', result.nexamples)):
    print(f'{label}: {value}')
In [ ]:
# Spot-check predictions on two hand-written examples (one clearly
# negative token, one positive-sounding sentence).
sample_inputs = [
    'motherfucking',
    'I really like what senator mccain said about the future of our great nation',
]
predicted = classifier.predict(sample_inputs)
print(predicted)