In [ ]:
import numpy as np
import nltk
from collections import Counter
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
sns.set_context("paper", font_scale=1.2)
%matplotlib notebook
%load_ext autoreload
%autoreload 2
import os
import re
import sys
sys.path.append(os.path.join(os.getcwd(), "..", "..", "src"))
import util.io as mio
from util import statsUtil
import util.plotting as mplot
from model.conversationDataframe import ConversationDataframe
from stats.iConvStats import IConvStats
from stats.wordsCountStats import WordsCountStats
In [ ]:
def load_text(filepaths):
    """
    Load the text you want to analyse.
    :param filepaths: list of paths to text files to load
    :return: single string containing all retrieved text
    """
    text = ""
    for path in filepaths:
        with open(path, 'r', encoding='UTF-8') as f:
            text += "\n" + f.read()
    return text
In [ ]:
text = load_text([""])
In [ ]:
words = statsUtil.getWords(text)
types = set(words)
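In [ ]:
# Note: statsUtil.getWords comes from this repository's src folder. If it is not
# available, a minimal stand-in (an assumption, not the original implementation)
# could lower-case the text and keep simple word tokens:
def get_words_fallback(raw_text):
    # Hypothetical replacement for statsUtil.getWords
    return re.findall(r"[a-z']+", raw_text.lower())

# words = get_words_fallback(text)  # uncomment only if statsUtil is unavailable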
In [ ]:
print("Total length: {:.0f}".format(len(text)))
print("Tokens count: {:.0f}".format(len(words)))
print("Distinct tokens count: {:.0f}".format(len(set(words))))
print("Lexical richness: {0:.5f}".format(len(types)/len(words)))
In [ ]:
def plot_most_common(most_common_ngrams, n_most, join=False):
    most_common_ngrams, count = zip(*most_common_ngrams.most_common(n_most))
    if join:
        # Join n-gram tuples into single strings for readable axis labels
        most_common_ngrams = [" ".join(e) for e in most_common_ngrams]
    ax = sns.pointplot(y=list(most_common_ngrams), x=list(count))
    plt.show()
In [ ]:
# Most common words
words_count = Counter(words)
# Plot most common words
plot_most_common(words_count, n_most=30)
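In [ ]:
# The raw counts are typically dominated by stopwords; a possible variant (assumes the
# NLTK stopwords corpus has been downloaded via nltk.download('stopwords')) filters
# them out before counting.
from nltk.corpus import stopwords
content_words = [w for w in words if w not in set(stopwords.words("english"))]
plot_most_common(Counter(content_words), n_most=30)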
In [ ]:
most_common_bigrams = Counter(nltk.bigrams(words))
plot_most_common(most_common_bigrams, 20, join=True)
In [ ]:
most_common_trigrams = Counter(nltk.trigrams(words))
plot_most_common(most_common_trigrams, 20, join=True)
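In [ ]:
# Raw n-gram counts favour frequent function-word combinations; as an alternative sketch
# (not part of the original analysis), NLTK's collocation finder can rank bigrams by an
# association measure such as PMI. The frequency filter of 3 is an arbitrary choice.
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures
bigram_measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(words)
finder.apply_freq_filter(3)  # ignore bigrams seen fewer than 3 times
print(finder.nbest(bigram_measures.pmi, 20))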
In [ ]:
# Get the most relevant words using TF-IDF
# For this statistic we need additional pieces of text to compare against our speech transcript;
# here we simply load a few corpora from NLTK
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
def get_top_features(text, n):
    # Load corpora for different genres to compare against
    c1 = nltk.corpus.gutenberg.raw('carroll-alice.txt')
    c2 = nltk.corpus.inaugural.raw("2009-Obama.txt")
    c3 = nltk.corpus.webtext.raw("firefox.txt")
    # Load English stopwords
    stops = set(stopwords.words("english"))
    # Compute the TF-IDF matrix and keep the top-scoring n-grams for our text (row 0)
    vectorizer = TfidfVectorizer(analyzer='word', stop_words=list(stops), ngram_range=(2, 3))
    tfIdf = vectorizer.fit_transform([text, c1, c2, c3]).toarray()
    indices = np.argsort(tfIdf[0])[::-1]
    features = vectorizer.get_feature_names_out()
    top_features = [features[i] for i in indices[:n] if tfIdf[0][i] != 0]
    return top_features
In [ ]:
get_top_features(text, 20)
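In [ ]:
# Optional variant (an assumption on top of get_top_features above, not part of the
# original analysis): return the TF-IDF scores together with the n-grams, to see how
# sharply relevance drops off.
def get_top_features_with_scores(text, n):
    c1 = nltk.corpus.gutenberg.raw('carroll-alice.txt')
    c2 = nltk.corpus.inaugural.raw("2009-Obama.txt")
    c3 = nltk.corpus.webtext.raw("firefox.txt")
    stops = set(stopwords.words("english"))
    vectorizer = TfidfVectorizer(analyzer='word', stop_words=list(stops), ngram_range=(2, 3))
    tfIdf = vectorizer.fit_transform([text, c1, c2, c3]).toarray()
    indices = np.argsort(tfIdf[0])[::-1]
    features = vectorizer.get_feature_names_out()
    return [(features[i], tfIdf[0][i]) for i in indices[:n] if tfIdf[0][i] != 0]

# get_top_features_with_scores(text, 20)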
In [ ]:
# Prose stats: split the text into sentences and look at their character lengths
sentences = list(filter(lambda x: len(x) > 0, map(str.strip, re.split(r'[\.\?!\n]', text))))
sen_len = [len(sent) for sent in sentences]
print("Average sentence length {:.1f}. Max {}, min {}".format(np.mean(sen_len), max(sen_len), min(sen_len)))
In [ ]:
# Print unusually long sentences (more than 300 characters)
for sent in sentences:
    if len(sent) > 300:
        print("* " + sent)