In [155]:
import string
import nltk
import re
import json
import pandas as pd
import numpy as np
import pickle
import csv
from textstat.textstat import *
from vaderSentiment.vaderSentiment import sentiment as VS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn import naive_bayes
from sklearn import cross_validation
In [128]:
##Defining stopwords list
stopwords = nltk.corpus.stopwords.words("english")
stemmer = nltk.stem.porter.PorterStemmer()
def tokenize(page):
    tokens = page.split()
    #Remove stopwords
    #tokens = [t for t in tokens if t not in stopwords]
    #Stem tokens
    tokens = [stemmer.stem(t) for t in tokens]
    return tokens
In [4]:
m1 = "The mid 2010s have witnessed a resurgence of nationalist discourse in the United States, mirroring longer-term trends in the European public sphere. Politicians on both sides of the Atlantic have articulated visions of their nations under siege—by immigrants, refugees, domestic minority pop- ulations, and these groups’ ostensible accomplices among the political and cultural elites. Evoking nostalgia for the nation’s bygone glory days, these diagnoses have been coupled with sundry pol- icy proposals aimed at making the country great again, to paraphrase Donald Trump’s campaign slogan: from the tightening of national borders, increased surveillance of national populations, and scaling back of supranational integration to an ill-fitting mix of foreign policy isolationism and hawkish calls for unilateral projection of military power abroad. Narratives of the nation’s putative failings have resonated with beliefs deeply held by large segments of the voting public, laying bare cultural cleavages that are likely to shape election outcomes, policy decisions, and social movement mobilization."
In [5]:
m2 = "What might a research program built on such a foundation look like? First, it should consider nationalism from the bottom up, as a set of intersubjective meanings and affective orientations that give people a sense of self and guide their social interactions and political choices. Such a shift would imply not only a focus on popular beliefs and attitudes, but also the understanding that nationhood is only one source of identity, whose salience depends on a variety of contextual factors. Second, such research should explicitly consider the heterogeneity of vernacular conceptions of the nation within any given polity. The nation is not a static cultural object with a single shared meaning, but a site of active political contestation between cultural communities with strikingly different belief systems. Such conflicts are at the heart of contemporary political debates in the United States and Europe."
In [6]:
m3 = "All three objectives require scholars’ engagement with meanings held by individuals embedded in concrete social environments. If the nation is not just a political entity but also a cognitive frame through which people apprehend social reality and construct routinized strategies of action, research on nationalism must incorporate insights from cultural sociology and social psychology about how meanings structured by institutions shape social interaction and group relations. This suggests a research strategy that views dispositions toward the nation as relational, intersubjective, morally and affectively laden, and largely taken for granted. The resulting empirical investigations are likely to require an adaptation of existing research methods and the exploitation of new sources of data. Fortunately, the constitutive elements of this research agenda already exist; what is needed is their integration across disciplinary and methodological boundaries."
In [7]:
papers = [m1,m2,m3]
In [121]:
def word_count(paper):
    return len(paper.split())

def char_count(word):
    return len(word)

def avg_word_length(paper):
    words = word_count(paper)
    char_total = 0
    for word in paper.split():
        char_total += char_count(word)
    return round(float(char_total) / float(words), 3)

def max_word_length(paper):
    max_length = 0
    for word in paper.split():
        if len(word) > max_length:
            max_length = len(word)
    return max_length

def avg_syllables_per_word(paper):
    return textstat.avg_syllables_per_word(paper)

def avg_letters_per_word(paper):
    return textstat.avg_letter_per_word(paper)

def num_difficult_words(paper):
    return textstat.difficult_words(paper)

def num_polysyllable(paper):
    return textstat.polysyllabcount(paper)

def num_sentences(paper):
    return textstat.sentence_count(paper)

def ref_count_super_basic(paper):
    # Crude proxy for in-text reference count: closing parentheses
    return paper.count(')')

def url_count(paper):
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
                       '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    parsed_text = re.sub(giant_url_regex, 'URL', paper)
    # Count URLs in the substituted text, not the original string
    return parsed_text.count('URL')
In [122]:
def basic_stats(paper):
    return [word_count(paper), avg_word_length(paper), max_word_length(paper),
            avg_syllables_per_word(paper) / avg_letters_per_word(paper),  # syllables-to-letters ratio
            num_difficult_words(paper), num_polysyllable(paper), num_sentences(paper)]
    # ref_count_super_basic(paper), url_count(paper) left out for now
In [114]:
def readability(paper):
    fkg = textstat.flesch_kincaid_grade(paper)
    fre = textstat.flesch_reading_ease(paper)
    dcr = textstat.dale_chall_readability_score(paper)
    smg = textstat.smog_index(paper)
    cli = textstat.coleman_liau_index(paper)
    ari = textstat.automated_readability_index(paper)
    return [fkg, fre, dcr, smg, cli, ari]
In [ ]:
#Get an SAT/GRE word list and use counts of the words as features
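# A minimal sketch of that idea (hypothetical helper, not part of the original
# notebook): assumes a plain-text word list, one word per line, saved locally
# as 'sat_gre_words.txt'; uses the `string` module imported above.
def vocab_list_count(paper, wordlist_path='sat_gre_words.txt'):
    with open(wordlist_path) as f:
        vocab = set(line.strip().lower() for line in f if line.strip())
    tokens = [t.strip(string.punctuation).lower() for t in paper.split()]
    return sum(1 for t in tokens if t in vocab)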
In [165]:
def avg_readability(readability_scores):
    scores = np.array(readability_scores)
    mean = np.mean(scores)
    sd = np.std(scores)
    return [mean, sd]
In [123]:
paper_stats = []
for p in papers:
    rd = readability(p)
    avgs = avg_readability(rd)
    bs = basic_stats(p)
    stats = rd + avgs + bs
    stats = [round(s, 3) for s in stats]
    paper_stats.append(stats)
In [124]:
results = np.array(paper_stats)
In [125]:
results
Out[125]:
In [130]:
vectorizer = TfidfVectorizer(
    #vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    tokenizer=tokenize,
    ngram_range=(2, 3),
    #stop_words=stopwords, #We do better when we keep stopwords
    lowercase=True,
    use_idf=True,
    smooth_idf=True,
    norm='l2',
    decode_error='replace'
)
In [131]:
results = pd.DataFrame(results)
In [147]:
text = np.array(papers)
text = pd.DataFrame(text)
In [135]:
labels = pd.Series([0,1,0])
In [148]:
results['text'] = text
results['labels'] = labels
In [156]:
results
Out[156]:
In [157]:
# Split into features and target
y = results['labels']
X_strings = results['text']
vectorizer.fit(X_strings) # fit vectorizer here
# X is our sparse matrix of predictors
X = vectorizer.transform(X_strings)
In [163]:
#MODELING
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)
bnb = naive_bayes.BernoulliNB()
bnb.fit(X_train, y_train)
y_pred = bnb.predict(X_test)
# Evaluate this classifier! #
# Precision: tp / (tp + fp)
# Recall: tp / (tp + fn)
# F_1: 2 * (precision * recall) / (precision + recall)
# i.e. the harmonic mean of precision and recall
print(
'Baseline guessing is {}'.format(float(sum(y_train)) / len(y_train))
)
print(
'The precision is {}'.format(metrics.precision_score(y_test, y_pred))
)
print(
'The recall is {}'.format(metrics.recall_score(y_test, y_pred))
)
print(
'The f score is {}'.format(metrics.f1_score(y_test, y_pred))
)
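In [ ]:
# Optional cross-check (a sketch, not part of the original workflow): recompute
# precision/recall/F1 by hand from the confusion matrix, following the formulas
# in the comments above, and compare with the sklearn.metrics values just printed.
tn, fp, fn, tp = metrics.confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
precision = float(tp) / (tp + fp) if (tp + fp) else 0.0
recall = float(tp) / (tp + fn) if (tp + fn) else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
print('By hand: precision={}, recall={}, f1={}'.format(precision, recall, f1))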
In [164]:
y_pred
Out[164]:
In [ ]: