In [155]:
import string
import nltk
import re
import json
import pandas as pd
import numpy as np
import pickle
import csv
from textstat.textstat import *
from vaderSentiment.vaderSentiment import sentiment as VS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn import naive_bayes
from sklearn import cross_validation
In [128]:
##Defining stopwords list
stopwords = nltk.corpus.stopwords.words("english")
stemmer = nltk.stem.porter.PorterStemmer()
def tokenize(page):
    tokens = page.split()
    #Remove stopwords
    #tokens = [t for t in tokens if t not in stopwords]
    #Stem tokens
    tokens = [stemmer.stem(t) for t in tokens]
    return tokens
In [4]:
m1 = "The mid 2010s have witnessed a resurgence of nationalist discourse in the United States, mirroring longer-term trends in the European public sphere. Politicians on both sides of the Atlantic have articulated visions of their nations under siege—by immigrants, refugees, domestic minority pop- ulations, and these groups’ ostensible accomplices among the political and cultural elites. Evoking nostalgia for the nation’s bygone glory days, these diagnoses have been coupled with sundry pol- icy proposals aimed at making the country great again, to paraphrase Donald Trump’s campaign slogan: from the tightening of national borders, increased surveillance of national populations, and scaling back of supranational integration to an ill-fitting mix of foreign policy isolationism and hawkish calls for unilateral projection of military power abroad. Narratives of the nation’s putative failings have resonated with beliefs deeply held by large segments of the voting public, laying bare cultural cleavages that are likely to shape election outcomes, policy decisions, and social movement mobilization."
In [5]:
m2 = "What might a research program built on such a foundation look like? First, it should consider nationalism from the bottom up, as a set of intersubjective meanings and affective orientations that give people a sense of self and guide their social interactions and political choices. Such a shift would imply not only a focus on popular beliefs and attitudes, but also the understanding that nationhood is only one source of identity, whose salience depends on a variety of contextual factors. Second, such research should explicitly consider the heterogeneity of vernacular conceptions of the nation within any given polity. The nation is not a static cultural object with a single shared meaning, but a site of active political contestation between cultural communities with strikingly different belief systems. Such conflicts are at the heart of contemporary political debates in the United States and Europe."
In [6]:
m3 = "All three objectives require scholars’ engagement with meanings held by individuals embedded in concrete social environments. If the nation is not just a political entity but also a cognitive frame through which people apprehend social reality and construct routinized strategies of action, research on nationalism must incorporate insights from cultural sociology and social psychology about how meanings structured by institutions shape social interaction and group relations. This suggests a research strategy that views dispositions toward the nation as relational, intersubjective, morally and affectively laden, and largely taken for granted. The resulting empirical investigations are likely to require an adaptation of existing research methods and the exploitation of new sources of data. Fortunately, the constitutive elements of this research agenda already exist; what is needed is their integration across disciplinary and methodological boundaries."
In [7]:
papers = [m1,m2,m3]
In [121]:
def word_count(paper):
    return len(paper.split())

def char_count(word):
    return len(word)

def avg_word_length(paper):
    words = word_count(paper)
    char_total = 0
    for word in paper.split():
        char_total += char_count(word)
    return round(float(char_total) / float(words), 3)

def max_word_length(paper):
    max_length = 0
    for word in paper.split():
        if len(word) > max_length:
            max_length = len(word)
    return max_length

def avg_syllables_per_word(paper):
    return textstat.avg_syllables_per_word(paper)

def avg_letters_per_word(paper):
    return textstat.avg_letter_per_word(paper)

def num_difficult_words(paper):
    return textstat.difficult_words(paper)

def num_polysyllable(paper):
    return textstat.polysyllabcount(paper)

def num_sentences(paper):
    return textstat.sentence_count(paper)

def ref_count_super_basic(paper):
    # Crude proxy for in-text reference count: closing parentheses
    return paper.count(')')

def url_count(paper):
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
                       '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    parsed_text = re.sub(giant_url_regex, 'URL', paper)
    # Count URLs in the substituted text, not the original string
    return parsed_text.count('URL')
In [122]:
def basic_stats(paper):
    return [word_count(paper), avg_word_length(paper), max_word_length(paper),
            avg_syllables_per_word(paper) / avg_letters_per_word(paper),  # syllables-to-letters ratio
            num_difficult_words(paper), num_polysyllable(paper), num_sentences(paper)]
    # ref_count_super_basic(paper), url_count(paper) left out for now
In [114]:
def readability(paper):
    fkg = textstat.flesch_kincaid_grade(paper)
    fre = textstat.flesch_reading_ease(paper)
    dcr = textstat.dale_chall_readability_score(paper)
    smg = textstat.smog_index(paper)
    cli = textstat.coleman_liau_index(paper)
    ari = textstat.automated_readability_index(paper)
    return [fkg, fre, dcr, smg, cli, ari]
In [ ]:
#Get an SAT/GRE word list and use counts of the words as features
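# A minimal sketch of that idea (hypothetical helper, not part of the original
# notebook): assumes a plain-text word list, one word per line, saved locally
# as 'sat_gre_words.txt'; uses the `string` module imported above.
def vocab_list_count(paper, wordlist_path='sat_gre_words.txt'):
    with open(wordlist_path) as f:
        vocab = set(line.strip().lower() for line in f if line.strip())
    tokens = [t.strip(string.punctuation).lower() for t in paper.split()]
    return sum(1 for t in tokens if t in vocab)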
In [165]:
def avg_readability(readability_scores):
    scores = np.array(readability_scores)
    mean = np.mean(scores)
    sd = np.std(scores)
    return [mean, sd]
In [123]:
paper_stats = []
for p in papers:
    rd = readability(p)
    avgs = avg_readability(rd)
    bs = basic_stats(p)
    stats = rd + avgs + bs
    stats = [round(s, 3) for s in stats]
    paper_stats.append(stats)
In [124]:
results = np.array(paper_stats)
In [125]:
results
Out[125]:
In [130]:
vectorizer = TfidfVectorizer(
    #vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    tokenizer=tokenize,
    ngram_range=(2, 3),
    #stop_words=stopwords, #We do better when we keep stopwords
    lowercase=True,
    use_idf=True,
    smooth_idf=True,
    norm='l2',
    decode_error='replace'
)
In [131]:
results = pd.DataFrame(results)
In [147]:
text = np.array(papers)
text = pd.DataFrame(text)
In [135]:
labels = pd.Series([0,1,0])
In [148]:
results['text'] = text
results['labels'] = labels
In [156]:
results
Out[156]:
In [157]:
# Split into features and target
y = results['labels']
X_strings = results['text']
vectorizer.fit(X_strings) # fit vectorizer here
# X is our sparse matrix of predictors
X = vectorizer.transform(X_strings)
In [163]:
#MODELING
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)
bnb = naive_bayes.BernoulliNB()
bnb.fit(X_train, y_train)
y_pred = bnb.predict(X_test)
# Evaluate this classifier! #
# Precision: tp / (tp + fp)
# Recall: tp / (tp + fn)
# F_1: 2 * (precision * recall) / (precision + recall)
# i.e. the harmonic mean of precision and recall
print(
'Baseline guessing is {}'.format(float(sum(y_train)) / len(y_train))
)
print(
'The precision is {}'.format(metrics.precision_score(y_test, y_pred))
)
print(
'The recall is {}'.format(metrics.recall_score(y_test, y_pred))
)
print(
'The f score is {}'.format(metrics.f1_score(y_test, y_pred))
)
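In [ ]:
# Optional cross-check (a sketch, not part of the original workflow): recompute
# precision/recall/F1 by hand from the confusion matrix, following the formulas
# in the comments above, and compare with the sklearn.metrics values just printed.
tn, fp, fn, tp = metrics.confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
precision = float(tp) / (tp + fp) if (tp + fp) else 0.0
recall = float(tp) / (tp + fn) if (tp + fn) else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
print('By hand: precision={}, recall={}, f1={}'.format(precision, recall, f1))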
In [164]:
y_pred
Out[164]:
In [ ]: