In [1]:
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import SGDClassifier, LinearRegression
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from nltk.stem.snowball import SnowballStemmer
from nltk import word_tokenize
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
from matplotlib.pyplot import plot_date
%matplotlib inline
from ggplot import *
hfont = {'fontname':'Arial'}
import pandas as pd
import re
import pickle
import tweepy
from tweepy import OAuthHandler
import json
import collections
In [217]:
def flatten(d, parent_key='', sep='_'):
    '''
    input: nested dictionary
    output: flattened dictionary
    '''
    items = []
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        if isinstance(v, collections.MutableMapping):
            items.extend(flatten(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)
def get_tweet_dataframe(filename):
    '''
    input: filename (of pickle file)
    output: pandas dataframe
    function loads pickled array of tweepy status objects, flattens the
    nested json structure, appends these to a temporary array, and then
    creates a dataframe from the array
    '''
    tweet_list = []
    timeline_tweets = pickle.load(open(filename, "rb"))
    for tweet in timeline_tweets:
        flat_tweet = flatten(tweet._json)
        entity_hashtag = flat_tweet.pop('entities_hashtags', None)
        if not entity_hashtag:
            # missing key or empty list: no hashtags for this tweet
            flat_tweet['hashtag'] = []
        else:
            ht_list = []
            for ht in entity_hashtag:
                ht_list.append(ht['text'])
            flat_tweet['hashtag'] = ht_list
        tweet_list.append(flat_tweet)
    tweet_df = pd.DataFrame(tweet_list)
    return tweet_df
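A quick sanity check of flatten on a toy nested dictionary (invented data, just to show how keys are joined with the separator):
In [ ]:
# Toy example (not real tweet data): nested keys are joined with '_'
nested = {'user': {'name': 'example', 'counts': {'followers': 10}}, 'id': 1}
flatten(nested)
# expected: {'user_name': 'example', 'user_counts_followers': 10, 'id': 1}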
In [215]:
'''
Get tweets from candidates
'''
(consumer_key, consumer_secret, access_token, access_secret) = pickle.load(open("auth_secrets.p", "rb"))
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
api = tweepy.API(auth)
'''
Initialize arrays of tweets
'''
HC_timeline_tweets = []
DT_timeline_tweets = []
for tweet in tweepy.Cursor(api.user_timeline, id='HillaryClinton').items():
    HC_timeline_tweets.append(tweet)
for tweet in tweepy.Cursor(api.user_timeline, id='realDonaldTrump').items():
    DT_timeline_tweets.append(tweet)
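If the timeline pulls above hit Twitter's rate limits, one option for a re-run (assuming a tweepy version that supports the standard wait_on_rate_limit flag) is to build the client so it sleeps through the limit window instead of raising:
In [ ]:
# Optional sketch (not part of the original run): pause when the rate limit is hit
api = tweepy.API(auth, wait_on_rate_limit=True)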
In [216]:
pickle.dump(HC_timeline_tweets, open("hc_tweets2.p", "wb"))
pickle.dump(DT_timeline_tweets, open("dt_tweets2.p", "wb"))
In [222]:
# Create tweet dataframes for each of the candidates
#HCdf = get_tweet_dataframe("hc_tweets.p")
#DTdf = get_tweet_dataframe("dt_tweets.p")
HCdf2 = get_tweet_dataframe("hc_tweets2.p")
DTdf2 = get_tweet_dataframe("dt_tweets2.p")
# Pickle the dataframes
#pickle.dump(HCdf, open('HCdf.p', 'wb'))
#pickle.dump(DTdf, open('DTdf.p', 'wb'))
pickle.dump(HCdf2, open('HCdf2.p', 'wb'))
pickle.dump(DTdf2, open('DTdf2.p', 'wb'))
In [2]:
# Load pickled dataframes
HCdf2 = pickle.load(open('HCdf2.p', 'rb'))
DTdf2 = pickle.load(open('DTdf2.p', 'rb'))
In [3]:
# Create new dataframes with just relevant columns
# .copy() so the .loc assignments below don't trigger SettingWithCopyWarning
HC = HCdf2[['text','favorite_count','retweet_count','created_at']].copy()
DT = DTdf2[['text','favorite_count','retweet_count','created_at']].copy()
In [4]:
'''
Add new feature columns to HC, DT dataframes
'''
def hr_func(ts):
    return ts.hour
def day_func(ts):
    return ts.weekday()
def char_length_func(ts):
    return len(ts)
def word_length_func(ts):
    return len(ts.split(' '))
HC.loc[:,'speaker']='HC'
DT.loc[:,'speaker']='DT'
DT.loc[:,'created_at']=pd.to_datetime(DT.loc[:,'created_at'])
HC.loc[:,'created_at']=pd.to_datetime(HC.loc[:,'created_at'])
DT.loc[:,'hour'] = DT.loc[:,'created_at'].apply(hr_func)
HC.loc[:,'hour'] = HC.loc[:,'created_at'].apply(hr_func)
DT.loc[:,'day'] = DT.loc[:,'created_at'].apply(day_func)
HC.loc[:,'day'] = HC.loc[:,'created_at'].apply(day_func)
DT.loc[:,'char_length'] = DT.loc[:,'text'].apply(char_length_func)
HC.loc[:,'char_length'] = HC.loc[:,'text'].apply(char_length_func)
DT.loc[:,'word_length'] = DT.loc[:,'text'].apply(word_length_func)
HC.loc[:,'word_length'] = HC.loc[:,'text'].apply(word_length_func)
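The same feature columns can also be built without the helper functions, using pandas' .dt and .str accessors once created_at is a datetime; a sketch of the equivalent one-liners (recomputing the DT columns, with identical results):
In [ ]:
# Equivalent to the apply()-based feature columns above, via pandas accessors
DT.loc[:,'hour'] = DT['created_at'].dt.hour
DT.loc[:,'day'] = DT['created_at'].dt.dayofweek
DT.loc[:,'char_length'] = DT['text'].str.len()
DT.loc[:,'word_length'] = DT['text'].str.split(' ').str.len()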
In [5]:
def clean_tweets(text_array):
    '''
    Function takes out literal unicode-escape artifacts, urls, @users, and the RT marker
    '''
    edited_array = []
    for tweet in text_array:
        tweet = re.sub(r'\\u\d+', ' ', tweet.strip())   # raw string so the backslash is matched literally
        tweet = re.sub(r'http\S*', ' ', tweet)
        tweet = re.sub(r'@\S*', ' ', tweet)
        tweet = re.sub(r'\b[rR][tT]\b', ' ', tweet)      # word boundaries so "rt" inside words is kept
        edited_array.append(tweet)
    return edited_array
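A small illustration of what clean_tweets strips out, on an invented tweet:
In [ ]:
# Made-up example: the URL, the @mention, and the leading RT marker are all blanked out
clean_tweets(['RT @example: America needs better answers https://t.co/abc123'])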
In [6]:
HC.loc[:,'etext'] = clean_tweets(HC['text'])
DT.loc[:,'etext'] = clean_tweets(DT['text'])
In [6]:
# Pickle simple dataframes
pickle.dump(HC, open('HC.p', 'wb'))
pickle.dump(DT, open('DT.p', 'wb'))
In [2]:
# Pickle simple dataframes
HC = pickle.load(open('HC.p', 'rb'))
DT = pickle.load(open('DT.p', 'rb'))
In [7]:
HC.head(1)
Out[7]:
In [4]:
GUN_CONTROL=["gun","assault weapon","semi-automatic","pistol grip","selective fire weapons",
"flash suppressor","high-capacity magazine","strawman purchase","private party transfer",
"federally licensed firearms dealer","gunshow","gunshow loophole","gun control act of 1968",
"firearm owners protection","brady handgun violence prevention","assault weapons ban",
"second ammendment","2nd ammendment","national rifle association"]
ABORTION=["women's right","right to choose","prolife","pro-life",r"roe vs\.? wade",
r"pro-?choice","planned parenthood"]
IMMIGRATION=["[Mm]exic","[Mm]uslim","refugee","borders","wall","undocumented","birthright citizenship"]
CRIMINAL_JUSTICE=["prison reform","criminal sentencing","steep penalties","non-?violent",
"drug offenses","mass incarceration","prison sentence","black lives matter",
"police shooting",r"death penalty|penalties"]
MONEY=["money","budget","jobs","middle class", "wages", "economy","economic","recession",
"employment rate","income gap","minimum wage","tax reform","loopholes","job creation",
"tax cut","bankrupt","Wall Street","wallstreet","rich"]
NATIONAL_DEFENSE=["defense","security","terrorism","[sS]yria","[iI]sis","ISIS", "islamic state",
"[iI]raq","[iI]ran","ground troops","military","nuclear deal", "isolation"]
CLIMATE_CHANGE=["climate", "climate change", "global warming","carbon emitters","carbon emission",
"clean power","climate policy","renewable energy","fossil fuel","coal","gas",
"alternative energy","reduce emission","fossil fuel", "solar panel","solar energy"]
HEALTH_CARE=["insurance","health care","healthcare","doctor","medication","medicine",
"affordable care act","obamacare","insurance premium","insurance cost","healthcare",
"medicaid","medicare","prescription"]
HIGHER_EDUCATION=["college","university education","student loans","gradutes","student debt",
"college expenses","tution","community college","pell grants"]
DIVERSITY=['racism','lgbt','bigot','LGBT','gay','lesbian']
In [5]:
def issue_column(KEYWORD, tweet_array):
    '''
    Returns a 0/1 flag per tweet: 1 if any keyword/regex in KEYWORD matches the tweet text
    '''
    new_array = []
    for t in tweet_array:
        a = 0
        for g in KEYWORD:
            if re.search(g, t):
                a = 1
                break
        new_array.append(a)
    return new_array
DT.loc[:,'guncontrol'] = issue_column(GUN_CONTROL,DT['etext'])
DT.loc[:,'abortion'] = issue_column(ABORTION,DT['etext'])
DT.loc[:,'immigration'] = issue_column(IMMIGRATION,DT['etext'])
DT.loc[:,'criminal_justice'] = issue_column(CRIMINAL_JUSTICE,DT['etext'])
DT.loc[:,'money'] = issue_column(MONEY,DT['etext'])
DT.loc[:,'national_defense'] = issue_column(NATIONAL_DEFENSE,DT['etext'])
DT.loc[:,'climate_change'] = issue_column(CLIMATE_CHANGE,DT['etext'])
DT.loc[:,'health_care'] = issue_column(HEALTH_CARE,DT['etext'])
DT.loc[:,'higher_education'] = issue_column(HIGHER_EDUCATION,DT['etext'])
DT.loc[:,'diversity'] = issue_column(DIVERSITY,DT['etext'])
HC.loc[:,'guncontrol'] = issue_column(GUN_CONTROL,HC['etext'])
HC.loc[:,'abortion'] = issue_column(ABORTION,HC['etext'])
HC.loc[:,'immigration'] = issue_column(IMMIGRATION,HC['etext'])
HC.loc[:,'criminal_justice'] = issue_column(CRIMINAL_JUSTICE,HC['etext'])
HC.loc[:,'money'] = issue_column(MONEY,HC['etext'])
HC.loc[:,'national_defense'] = issue_column(NATIONAL_DEFENSE,HC['etext'])
HC.loc[:,'climate_change'] = issue_column(CLIMATE_CHANGE,HC['etext'])
HC.loc[:,'health_care'] = issue_column(HEALTH_CARE,HC['etext'])
HC.loc[:,'higher_education'] = issue_column(HIGHER_EDUCATION,HC['etext'])
HC.loc[:,'diversity'] = issue_column(DIVERSITY,HC['etext'])
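The block of repeated .loc assignments above can also be written as a loop over a topic-to-keywords dict; a compact sketch that produces the same flags:
In [ ]:
# Same issue flags as above, built in a loop (column names match the explicit assignments)
ISSUES = {'guncontrol': GUN_CONTROL, 'abortion': ABORTION, 'immigration': IMMIGRATION,
          'criminal_justice': CRIMINAL_JUSTICE, 'money': MONEY,
          'national_defense': NATIONAL_DEFENSE, 'climate_change': CLIMATE_CHANGE,
          'health_care': HEALTH_CARE, 'higher_education': HIGHER_EDUCATION,
          'diversity': DIVERSITY}
for df in (DT, HC):
    for col, keywords in ISSUES.items():
        df.loc[:, col] = issue_column(keywords, df['etext'])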
In [137]:
def create_standard_figure2(df1, y, color):
    # set figure size
    fig = plt.figure(figsize=(8,3))
    # set axes size
    ax = fig.add_axes([0.15, 0.1, 0.8, 0.8])
    #df1.plot(x='created_at',y=y, ax=ax, color=color)
    plot_date(df1['created_at'], df1[y]+.005, color=color, alpha=.3, markersize=24)
    #plt.xlim(('1992-01-12','2017-01-15'))
    plt.ylim(.99, 1.01)
    #ax.legend_.remove()
    """
    # Legend properties (position, text colors, background color)
    legend=ax.legend(loc="center left", bbox_to_anchor=[1,.5])
    for color,text in zip(colors,legend.get_texts()):
        text.set_color(color)
        #text.set_color("grey")
        text.set_size("11")
    frame = legend.get_frame()
    frame.set_facecolor('white')
    """
    plt.tick_params(
        axis='y',          # changes apply to the y-axis
        which='both',      # both major and minor ticks are affected
        left='off',        # ticks along the left edge are off
        right='off')       # ticks along the right edge are off
    ax.set_axis_bgcolor('white')
    # Axes borderlines
    ax.grid('off')
    #ax.grid(axis='x',which='major', color='grey', linestyle='-')
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.spines['bottom'].set_color('lightgrey')
    ax.tick_params(axis='x', which='minor', bottom='off', top='off', left='off')
    ax.tick_params(axis='x', which='major', top='off')
    plt.plot(['2015-11-11','2016-06-28'], [1.005, 1.005], color='#348ABD', alpha=.3, linestyle='-', linewidth=.75)
    plt.plot(['2015-11-11','2016-06-28'], [1-.005, 1-.005], color='#E24A33', alpha=.3, linestyle='-', linewidth=.75)
    plt.annotate('CLINTON', xy=(-1.01,.72), xycoords='axes fraction', horizontalalignment='right', size=14, fontweight="bold", color='#348ABD', **hfont)
    plt.annotate('TRUMP', xy=(-1.01,.22), xycoords='axes fraction', horizontalalignment='right', size=14, fontweight="bold", color='#E24A33', **hfont)
    #ax.tick_params(axis='y',which='major',right='off')
    #ax.tick_params(axis='y',which='major',left='off')
    #ax.tick_params(axis='y',which='minor',left='off')
    ax.set_xlabel('')
    ax.set_yticklabels([])
    return fig, ax
In [138]:
fig,ax = create_standard_figure2(HC,'guncontrol','#348ABD')
titlestring="Tweets referring to gun control"
plt.annotate(titlestring,
xy=(0.5,.95),xycoords='figure fraction',
size=14,color='black',fontweight="bold",
horizontalalignment='center', verticalalignment='center',**hfont)
plt.plot(['2016-06-12','2016-06-12'], [0, 2], color='black', linestyle='-', linewidth=.75)
plt.annotate('Orlando shootings', xy=('2016-06-08',1-.009),horizontalalignment='right', textcoords='data',size=12,color='black',**hfont)
plot_date(DT['created_at'], DT['guncontrol']-.005,color='#E24A33',alpha=.3,markersize=24)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
fig.savefig('guncontrol.png',dpi=300)
In [139]:
fig,ax = create_standard_figure2(HC,'abortion','#348ABD')
titlestring="Tweets referring to abortion"
plt.annotate(titlestring,
xy=(0.5,.95),xycoords='figure fraction',
size=18,color='black',fontweight="bold",
horizontalalignment='center', verticalalignment='center',**hfont)
#plt.plot(['2016-06-12','2016-06-12'], [0, 2], color='grey', linestyle='-', linewidth=.75)
#plt.annotate('Orlando shootings', xy=('2016-06-08',1-.009),horizontalalignment='right', textcoords='data',size=12,color='gray',**hfont)
plot_date(DT['created_at'], DT['abortion']-.005,color='#E24A33',alpha=.3,markersize=24)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
fig.savefig('abortion.png',dpi=300)
In [140]:
fig,ax = create_standard_figure2(HC,'immigration','#348ABD')
titlestring="Tweets referring to immigration"
plt.annotate(titlestring,
xy=(0.5,.95),xycoords='figure fraction',
size=18,color='black',fontweight="bold",
horizontalalignment='center', verticalalignment='center',**hfont)
plot_date(DT['created_at'], DT['immigration']-.005,color='#E24A33',alpha=.3,markersize=24)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
fig.savefig('immigration.png',dpi=300)
In [141]:
fig,ax = create_standard_figure2(HC,'criminal_justice','#348ABD')
titlestring="Tweets referring to criminal justice"
plt.annotate(titlestring,
xy=(0.5,.95),xycoords='figure fraction',
size=18,color='black',fontweight="bold",
horizontalalignment='center', verticalalignment='center',**hfont)
plot_date(DT['created_at'], DT['criminal_justice']-.005,color='#E24A33',alpha=.3,markersize=24)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
fig.savefig('criminal_justice.png',dpi=300)
In [142]:
fig,ax = create_standard_figure2(HC,'money','#348ABD')
titlestring="Tweets referring to the economy and jobs"
plt.annotate(titlestring,
xy=(0.5,.95),xycoords='figure fraction',
size=18,color='black',fontweight="bold",
horizontalalignment='center', verticalalignment='center',**hfont)
plot_date(DT['created_at'], DT['money']-.005,color='#E24A33',alpha=.3,markersize=24)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
fig.savefig('money.png',dpi=300)
In [143]:
fig,ax = create_standard_figure2(HC,'national_defense','#348ABD')
titlestring="Tweets referring to national defense"
plt.annotate(titlestring,
xy=(0.5,.95),xycoords='figure fraction',
size=18,color='black',fontweight="bold",
horizontalalignment='center', verticalalignment='center',**hfont)
plot_date(DT['created_at'], DT['national_defense']-.005,color='#E24A33',alpha=.3,markersize=24)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
fig.savefig('national_defense.png',dpi=300)
In [144]:
fig,ax = create_standard_figure2(HC,'climate_change','#348ABD')
titlestring="Tweets referring to climate change"
plt.annotate(titlestring,
xy=(0.5,.95),xycoords='figure fraction',
size=18,color='black',fontweight="bold",
horizontalalignment='center', verticalalignment='center',**hfont)
plot_date(DT['created_at'], DT['climate_change']-.005,color='#E24A33',alpha=.3,markersize=24)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
fig.savefig('climate_change.png',dpi=300)
In [85]:
fig,ax = create_standard_figure2(HC,'health_care','#348ABD')
titlestring="Tweets referring to health care"
plt.annotate(titlestring,
xy=(0.5,.95),xycoords='figure fraction',
size=18,color='black',fontweight="bold",
horizontalalignment='center', verticalalignment='center',**hfont)
plot_date(DT['created_at'], DT['health_care']-.005,color='#E24A33',alpha=.3,markersize=24)
fig.savefig('health_care.png',dpi=300)
In [86]:
fig,ax = create_standard_figure2(HC,'higher_education','#348ABD')
titlestring="Tweets referring to higher education"
plt.annotate(titlestring,
xy=(0.5,.95),xycoords='figure fraction',
size=18,color='black',fontweight="bold",
horizontalalignment='center', verticalalignment='center',**hfont)
plot_date(DT['created_at'], DT['higher_education']-.005,color='#E24A33',alpha=.3,markersize=24)
fig.savefig('higher_education.png',dpi=300)
In [87]:
fig,ax = create_standard_figure2(HC,'diversity','#348ABD')
titlestring="Tweets referring to diversity"
plt.annotate(titlestring,
xy=(0.5,.95),xycoords='figure fraction',
size=18,color='black',fontweight="bold",
horizontalalignment='center', verticalalignment='center',**hfont)
plot_date(DT['created_at'], DT['diversity']-.005,color='#E24A33',alpha=.3,markersize=24)
fig.savefig('diversity.png',dpi=300)
In [50]:
DT[DT['guncontrol']==1].text
Out[50]:
In [105]:
def n_grams(tweets, n=1):
    '''Return the 30 most frequent n-grams in the tweets (stop words kept)'''
    vectorizer = CountVectorizer(ngram_range=(n,n), token_pattern=r'\b\w+\b', min_df=1)
    X_2 = vectorizer.fit_transform(tweets)
    freqs = [(word, X_2.getcol(idx).sum()) for word, idx in vectorizer.vocabulary_.items()]
    # sort from largest to smallest
    ngramlist = sorted(freqs, key=lambda x: -x[1])
    return ngramlist[:30]
def n_grams_sw(tweets, n=1):
    '''Return the 30 most frequent n-grams with English stop words removed'''
    vectorizer = CountVectorizer(ngram_range=(n,n), token_pattern=r'\b\w+\b', stop_words='english')
    X_2 = vectorizer.fit_transform(tweets)
    freqs = [(word, X_2.getcol(idx).sum()) for word, idx in vectorizer.vocabulary_.items()]
    # sort from largest to smallest
    ngramlist = sorted(freqs, key=lambda x: -x[1])
    return ngramlist[:30]
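A minimal usage example for n_grams on a couple of invented strings (the real calls further down use the candidates' cleaned tweets):
In [ ]:
# Toy check: most frequent bigrams in two short invented strings
n_grams(['make america strong again', 'america first and america strong'], n=2)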
In [265]:
plt.figure()
ax = DT.plot.scatter(x='favorite_count',y='retweet_count',alpha=0.3,color='red')
HC.plot.scatter(x='favorite_count',y='retweet_count',alpha=0.3,color='blue',ax=ax)
Out[265]:
In [267]:
DT.head(1)
Out[267]:
In [274]:
# define a pipeline combining a text feature extractor with a simple
# classifier
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    #'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2), (1, 3), (1, 4)),  # unigrams up to 4-grams
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    'clf__n_iter': (10, 50, 80),
}
# HCDF: combined dataframe of both candidates' tweets (assumed to be HC and DT stacked)
HCDF = pd.concat([HC, DT], ignore_index=True)
X = HCDF['etext']
y = HCDF['speaker']
cv = StratifiedShuffleSplit(y, n_iter=5, test_size=0.2, random_state=42)
grid = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, cv=cv)
grid.fit(X, y)
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))
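Once the grid search has finished, the winning pipeline can be checked a bit further; a sketch (not part of the original notebook) that reports per-speaker precision and recall on a held-out split:
In [ ]:
# Diagnostic sketch: refit the best pipeline on a train split and report per-class metrics
from sklearn.metrics import classification_report
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
best = grid.best_estimator_
best.fit(X_train, y_train)
print(classification_report(y_test, best.predict(X_test)))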
In [443]:
print grid.predict(['I am the only one who can save the country'])
print grid.predict(['good'])
In [326]:
print grid.predict(['why do you pay your taxes but doesn"t?? watch new video with @moveon'])
In [320]:
pickle.dump(grid, open('DTHC_predictor.p', 'wb'))
In [542]:
HC.head(2)
Out[542]:
In [515]:
# NOTE: the *_diversity/*_guns/... arrays are assumed to have been built in an earlier session;
# this older cell is effectively superseded by the issue_column() assignments above
HC['diversity']=HC_diversity
DT['diversity']=DT_diversity
HC['guns']=HC_guns
DT['guns']=DT_guns
HC['abortion']=HC_abortion
DT['abortion']=DT_abortion
HC['immigration']=HC_immigration
DT['immigration']=DT_immigration
HC['money']=HC_money
DT['money']=DT_money
HC['nationaldefense']=HC_nationaldefense
DT['nationaldefense']=DT_nationaldefense
HC['climatechange']=HC_climatechange
DT['climatechange']=DT_climatechange
HC['healthcare']=HC_healthcare
DT['healthcare']=DT_healthcare
HC['highereducation']=HC_highereducation
DT['highereducation']=DT_highereducation
In [453]:
for t in HC['etext']:
    if 'gun' in t:
        print t
In [341]:
X = HCDF[['hour','day','char_length','word_length']]
y = HCDF['retweet_count']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
rng = np.random.RandomState(1)
# Fit regression model
regr_1 = DecisionTreeRegressor(max_depth=4)
regr_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
n_estimators=300, random_state=rng)
regr_1.fit(X_train, y_train)
regr_2.fit(X_train, y_train)
# Predict
y_1 = regr_1.predict(X_test)
y_2 = regr_2.predict(X_test)
#print regr.score(X_test,y_test)
In [343]:
print regr_1.score(X_test,y_test)
print regr_2.score(X_test, y_test)
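The fitted tree regressors also expose feature_importances_, which gives a rough view of which metadata feature drives the predicted retweet counts; a brief sketch:
In [ ]:
# Sketch: relative importance of the four metadata features in the boosted model
for name, imp in zip(['hour', 'day', 'char_length', 'word_length'], regr_2.feature_importances_):
    print('%s: %.3f' % (name, imp))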
In [399]:
HC.sort_values('favorite_count',ascending=False)[['text','favorite_count']].head(10)
Out[399]:
In [402]:
HC.iloc[349]['text']
Out[402]:
In [403]:
DT.sort_values('favorite_count',ascending=False)[['text','favorite_count']].head(10)
Out[403]:
In [409]:
DT.iloc[114]['text']
Out[409]:
In [346]:
mexico = r'[mM]ex|[lL]atin'
islam = r'[Ii]slam|[Mm]uslim'
In [436]:
job=[]
for s in HC['text']:
    if re.search('job|employment',s):
        job.append(1)
    else:
        job.append(0)
In [437]:
print HC.shape
print len(job)
In [438]:
HCissues=pd.DataFrame()
HCissues['date']=HC['created_at']
HCissues['jobs']=job
In [441]:
HCissues.plot()
Out[441]:
In [356]:
def n_grams_topic(tweets, n=1, topic=''):
    '''Return the 30 most frequent n-grams whose text matches the topic regex'''
    vectorizer = CountVectorizer(ngram_range=(n,n), token_pattern=r'\b\w+\b', min_df=1)
    X_2 = vectorizer.fit_transform(tweets)
    freqs = [(word, X_2.getcol(idx).sum()) for word, idx in vectorizer.vocabulary_.items() if re.search(topic, word)]
    # sort from largest to smallest
    ngramlist = sorted(freqs, key=lambda x: -x[1])
    return ngramlist[:30]
In [362]:
# HCtweets/DTtweets are assumed to be the cleaned tweet text columns
HCtweets = HC['etext']
DTtweets = DT['etext']
HC_ngrams_mexico = n_grams_topic(HCtweets, n=10, topic=mexico)
HC_ngrams_mexico[0:20]
Out[362]:
In [361]:
DT_ngrams_mexico = n_grams_topic(DTtweets, n=10, topic=mexico)
DT_ngrams_mexico[0:20]
Out[361]:
In [381]:
DT_1grams_sw = n_grams_sw(DTtweets, n=1)
DT_1grams_sw
Out[381]:
In [380]:
HC_1grams_sw = n_grams_sw(HCtweets, n=1)
HC_1grams_sw
Out[380]:
In [153]:
HC_2grams_sw = n_grams_sw(HCtweets, n=2)
HC_2grams_sw[0:20]
Out[153]:
In [154]:
HC_2grams = n_grams(HCtweets, n=2)
HC_2grams[0:20]
Out[154]:
In [155]:
HC_3grams = n_grams(HCtweets, n=3)
HC_3grams[0:20]
Out[155]:
In [156]:
HC_4grams = n_grams(HCtweets, n=4)
HC_4grams[0:20]
Out[156]:
In [157]:
HC_5grams = n_grams(HCtweets, n=5)
HC_5grams[0:20]
Out[157]:
In [85]:
DT_bigrams = n_grams(DTtweets, n=2)
In [158]:
DT_2grams_sw = n_grams_sw(DTtweets, n=2)
DT_2grams_sw[0:20]
Out[158]:
In [86]:
DT_bigrams[0:20]
Out[86]:
In [159]:
DT_trigrams = n_grams(DTtweets, n=3)
DT_trigrams[0:20]
Out[159]:
In [160]:
DT_quadgrams = n_grams(DTtweets, n=4)
DT_quadgrams[0:20]
Out[160]:
In [104]:
for t in HCtweets:
    if 'need a president who' in t:
        print t
In [110]:
for t in DTtweets:
    if 'I am the only' in t:
        print t
In [111]:
#weak on illegal immigration
for t in DTtweets:
    if 'weak on illegal immigration' in t:
        print t
In [383]:
sum(1 for t in HCtweets if 'bad' in t)
Out[383]:
In [386]:
sum(1 for t in HCtweets if 'good' in t)
Out[386]:
In [384]:
sum(1 for t in DTtweets if 'bad' in t)
Out[384]:
In [385]:
sum(1 for t in DTtweets if 'good' in t)
Out[385]:
In [387]:
sum(1 for t in DTtweets if 'great' in t)
Out[387]:
In [388]:
sum(1 for t in HCtweets if 'great' in t)
Out[388]:
In [417]:
# rows of HC whose text mentions Muslims or Islam
HC[[bool(re.search(r'[Mm]usl|[iI]slam', x)) for x in HC['text']]]