In [ ]:
import numpy as np
import nltk
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

sns.set_context("paper", font_scale=1.2)

%matplotlib notebook
%load_ext autoreload
%autoreload 2

import os
import re
import sys
sys.path.append(os.path.join(os.getcwd(), "..", "..", "src"))

import util.io as mio
from util import statsUtil
import util.plotting as mplot
from model.conversationDataframe import ConversationDataframe
from stats.iConvStats import IConvStats
from stats.wordsCountStats import WordsCountStats

Intro

This notebook is a utility for text analysis. The goal is to get insight into the structure, content and quality of a piece of text.

Examples: analysis of a CV, personal articles, job ads.

Load Text

Load the text you want to analyse.


In [ ]:
def load_text(filepaths):
    """
    Load text you want to analyse.
    :param filepaths: list of paths to text files to load
    :return: single string representing all retrieved text
    """
    text = ""
    for path in filepaths:
        with open(path, 'r', encoding='UTF-8') as f:
            text += "\n"+f.read()
    return text

In [ ]:
text = load_text([""])

Basic Stats

Length, token counts and lexical richness, n-gram distributions and most relevant features.


In [ ]:
words = statsUtil.getWords(text)
types = set(words)

In [ ]:
print("Total length: {:.0f}".format(len(text)))
print("Tokens count: {:.0f}".format(len(words)))
print("Distinct tokens count: {:.0f}".format(len(set(words))))
print("Lexical richness: {0:.5f}".format(len(types)/len(words)))

In [ ]:
def plot_most_common(ngrams_counter, n_most, join=False):
    """Plot the counts of the n_most most frequent n-grams in the given Counter."""
    most_common_ngrams, counts = zip(*ngrams_counter.most_common(n_most))
    if join:
        # Turn n-gram tuples into readable strings
        most_common_ngrams = [" ".join(e) for e in most_common_ngrams]
    sns.pointplot(y=list(most_common_ngrams), x=list(counts))
    plt.show()

In [ ]:
# Most common words
words_count = Counter(words)

# Plot most common words
plot_most_common(words_count, n_most=30)

In [ ]:
most_common_bigrams = Counter(nltk.bigrams(words))

plot_most_common(most_common_bigrams, 20, join=True)

In [ ]:
most_common_trigrams = Counter(nltk.trigrams(words))

plot_most_common(most_common_trigrams, 20, join=True)

In [ ]:
# Get most relevant words using TF-IDF
# For this statistic we need additional pieces of text to compare against our input text,
# so we simply load some corpora from NLTK

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

def get_top_features(text, n):
    # Load corpora for different genres
    c1 = nltk.corpus.gutenberg.raw('carroll-alice.txt')
    c2 = nltk.corpus.inaugural.raw("2009-Obama.txt")
    c3 = nltk.corpus.webtext.raw("firefox.txt")
    # Load English stopwords
    stops = set(stopwords.words("english"))

    # Compute TF-IDF matrix and return the top results for our text
    vectorizer = TfidfVectorizer(analyzer='word', stop_words=list(stops), ngram_range=(2, 3))
    tfIdf = vectorizer.fit_transform([text, c1, c2, c3]).toarray()
    indices = np.argsort(tfIdf[0])[::-1]
    # get_feature_names_out() on recent scikit-learn; use get_feature_names() on older versions
    features = vectorizer.get_feature_names_out()
    top_features = [features[i] for i in indices[:n] if tfIdf[0][i] != 0]
    return top_features
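
The corpora referenced in get_top_features (Gutenberg's carroll-alice, the 2009 inaugural address, the firefox webtext) and the English stopword list are NLTK data packages that are not bundled with the library itself; if they are not already present, a one-off download like the following should fetch them.

In [ ]:
# One-off download of the NLTK data packages used by get_top_features
# (only needed the first time on a given machine)
nltk.download('gutenberg')
nltk.download('inaugural')
nltk.download('webtext')
nltk.download('stopwords')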

In [ ]:
get_top_features(text, 20)

Prose Stats

“Over the whole document, make the average sentence length 15-20 words, 25-33 syllables and 75-100 characters.”


In [ ]:
# Prose stats: split the text into sentences and measure their length in characters
sentences = list(filter(lambda x: len(x) > 0, map(str.strip, re.split(r'[\.\?!\n]', text))))
sen_len = [len(sent) for sent in sentences]
print("Average sentence length (characters): {:.1f}. Max {}, min {}".format(np.mean(sen_len), max(sen_len), min(sen_len)))

In [ ]:
# Flag overly long sentences (more than 300 characters)
for sent in sentences:
    if len(sent) > 300:
        print("* " + sent)