Word Frequencies & Zipf's Law

This note book is used for generating a dictionary of word frequencies across the whole corpus as well as confirming Zipf's law.


In [99]:
# For monitoring duration of pandas processes
from tqdm import tqdm, tqdm_pandas

# To avoid RuntimeError: Set changed size during iteration
tqdm.monitor_interval = 0

# Register `pandas.progress_apply` and `pandas.Series.map_apply` with `tqdm`
# (can use `tqdm_gui`, `tqdm_notebook`, optional kwargs, etc.)
tqdm.pandas(desc="Progress:")

# Now you can use `progress_apply` instead of `apply`
# and `progress_map` instead of `map`
# can also groupby:
# df.groupby(0).progress_apply(lambda x: x**2)

In [100]:
import pandas as pd
df0 = pd.read_csv("../data/interim/001_normalised_keyed_reviews.csv", sep="\t", low_memory=False)
df0.head()


Out[100]:
uniqueKey reviewText
0 A2XQ5LZHTD4AFT##000100039X ['timeless', 'classic', 'demanding', 'assuming...
1 AF7CSSGV93RXN##000100039X ['first', 'read', 'prophet', 'kahlil', 'gibran...
2 A1NPNGWBVD9AK3##000100039X ['one', 'first', 'literary', 'books', 'recall'...
3 A3IS4WGMFR4X65##000100039X ['prophet', 'kahlil', 'gibrans', 'best', 'know...
4 AWLFVCT9128JV##000100039X ['gibran', 'khalil', 'gibran', 'born', 'one th...

In [101]:
def convert_text_to_list(review):
    return review.replace("[","").replace("]","").replace("'","").replace("\t","").split(",")

In [102]:
# Convert "reviewText" field to back to list
df0['reviewText'] = df0['reviewText'].astype(str)
df0['reviewText'] = df0['reviewText'].progress_apply(lambda text: convert_text_to_list(text));
df0['reviewText'].head()


Progress:: 100%|██████████| 582711/582711 [00:13<00:00, 42892.56it/s]
Out[102]:
0    [timeless,  classic,  demanding,  assuming,  t...
1    [first,  read,  prophet,  kahlil,  gibran,  th...
2    [one,  first,  literary,  books,  recall,  rea...
3    [prophet,  kahlil,  gibrans,  best,  known,  w...
4    [gibran,  khalil,  gibran,  born,  one thousan...
Name: reviewText, dtype: object

In [103]:
# Split negs
def split_neg(review):
    new_review = []
    for token in review:
        if '_' in token:
            split_words = token.split("_")
            new_review.append(split_words[0])
            new_review.append(split_words[1])
        else:
            new_review.append(token)
    return new_review

In [104]:
df0["reviewText"] = df0["reviewText"].progress_apply(lambda review: split_neg(review))
df0["reviewText"].head()


Progress:: 100%|██████████| 582711/582711 [00:09<00:00, 62759.72it/s]
Out[104]:
0    [timeless,  classic,  demanding,  assuming,  t...
1    [first,  read,  prophet,  kahlil,  gibran,  th...
2    [one,  first,  literary,  books,  recall,  rea...
3    [prophet,  kahlil,  gibrans,  best,  known,  w...
4    [gibran,  khalil,  gibran,  born,  one thousan...
Name: reviewText, dtype: object

In [105]:
### Remove Stop Words
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords(review):
    return [token for token in review if not token in stop_words]

In [106]:
df0["reviewText"] = df0["reviewText"].progress_apply(lambda review: remove_stopwords(review))
df0["reviewText"].head()


Progress:: 100%|██████████| 582711/582711 [00:55<00:00, 10507.46it/s]
Out[106]:
0    [timeless,  classic,  demanding,  assuming,  t...
1    [first,  read,  prophet,  kahlil,  gibran,  th...
2    [one,  first,  literary,  books,  recall,  rea...
3    [prophet,  kahlil,  gibrans,  best,  known,  w...
4    [gibran,  khalil,  gibran,  born,  one thousan...
Name: reviewText, dtype: object

In [107]:
import nltk
from nltk.probability import FreqDist

def collect_zipfs_law_metrics(review, fd):
    for token in review:
        fd.update([token])

In [108]:
fd = FreqDist()
df0['reviewText'].progress_apply(lambda review: collect_zipfs_law_metrics(review, fd));


Progress:: 100%|██████████| 582711/582711 [03:28<00:00, 2789.07it/s]

In [109]:
fd


Out[109]:
FreqDist({'timeless': 26,
          ' classic': 17851,
          ' demanding': 2481,
          ' assuming': 2899,
          ' title': 31208,
          ' gibran': 63,
          ' backs': 1822,
          ' excellent': 48518,
          ' style': 53946,
          ' content': 17043,
          ' means': 30436,
          ' publish': 2811,
          ' century': 34565,
          ' two': 222104,
          ' earlier': 20668,
          ' could': 186395,
          ' inspired': 7735,
          ' new': 191404,
          ' religion': 26344,
          ' mouth': 5174,
          ' old': 76688,
          ' man': 109999,
          ' sail': 783,
          ' away': 69473,
          ' far': 73068,
          ' destination': 1456,
          ' hear': 16369,
          ' wisdom': 13007,
          ' life': 259519,
          ' important': 60359,
          ' aspects': 16828,
          ' messege': 7,
          ' guide': 23941,
          ' book': 1502803,
          ' sufi': 247,
          ' sermon': 1260,
          ' much': 267441,
          ' put': 72396,
          ' perspective': 25132,
          ' without': 96163,
          ' hint': 3692,
          ' dogma': 1783,
          ' hints': 4314,
          ' birth': 10395,
          ' place': 68095,
          ' lebanon': 757,
          ' many': 273330,
          ' prophets': 1303,
          ' walked': 3332,
          ' earth': 23642,
          ' project': 11050,
          ' first': 245126,
          ' germinated': 18,
          ' likely': 21698,
          ' probably': 55570,
          ' becuase': 360,
          ' written': 124075,
          ' english': 26152,
          ' originally': 6357,
          ' writing': 124357,
          ' flows': 3164,
          ' pleasant': 4298,
          ' read': 467228,
          ' charcoal': 199,
          ' drawings': 3051,
          ' author': 213589,
          ' decorating': 850,
          ' pages': 90166,
          ' plus': 13356,
          ' loved': 53512,
          ' cover': 33005,
          'first': 13963,
          ' prophet': 2472,
          ' kahlil': 47,
          ' thirty': 16089,
          ' years': 154925,
          ' ago': 34136,
          ' times': 90304,
          ' since': 82823,
          ' gibrans': 30,
          ' messages': 4377,
          ' timeless': 2536,
          ' always': 73650,
          ' influenced': 4131,
          ' relationships': 18876,
          ' used': 60893,
          ' gift': 14420,
          ' persons': 6685,
          ' cared': 3425,
          ' paperback': 5165,
          ' version': 26043,
          ' lovely': 7584,
          ' makes': 102666,
          ' nice': 28923,
          ' yet': 81611,
          ' inexpensive': 1077,
          ' containing': 2024,
          ' valuable': 12275,
          ' lessons': 10936,
          ' anyone': 72183,
          ' purchased': 7487,
          ' three': 116777,
          ' future': 41106,
          ' giving': 26821,
          ' special': 19816,
          ' price': 19446,
          ' offered': 7517,
          'one': 29098,
          ' literary': 18325,
          ' books': 262122,
          ' recall': 3712,
          ' reading': 201307,
          ' mother': 45682,
          ' kept': 29236,
          ' collection': 19998,
          ' works': 51471,
          ' often': 68942,
          ' curious': 7600,
          ' see': 139124,
          ' attracted': 4663,
          ' looked': 11790,
          ' either': 43383,
          ' eight': 18633,
          ' nine': 20239,
          ' time': 286763,
          ' believe': 60676,
          ' taste': 10443,
          ' spirituality': 4324,
          ' seemed': 35045,
          ' relevant': 8297,
          ' forcefed': 69,
          ' nuns': 944,
          ' catechism': 598,
          ' class': 22212,
          ' rereading': 2992,
          ' im': 116336,
          ' struck': 5121,
          ' notion': 6523,
          ' hesse': 312,
          ' must': 84104,
          ' aware': 13270,
          ' texts': 6630,
          ' wrote': 27088,
          ' siddhartha': 219,
          ' contain': 5750,
          ' themes': 10700,
          ' no': 287112,
          ' else': 35728,
          ' path': 13354,
          ' select': 2125,
          ' course': 59938,
          ' preachers': 724,
          ' dime': 778,
          ' dozen': 4810,
          ' true': 72328,
          ' comes': 55815,
          ' within': 40910,
          ' teaching': 12686,
          ' love': 190146,
          ' particularly': 28809,
          ' stage': 9044,
          ' quot': 235220,
          ' even': 244133,
          ' crowns': 96,
          ' shall': 4036,
          ' crucify': 93,
          ' growth': 10497,
          ' pruning': 165,
          ' ascends': 79,
          ' height': 1588,
          ' caresses': 47,
          ' tenderest': 7,
          ' branches': 1188,
          ' quiver': 105,
          ' sun': 6416,
          ' descend': 494,
          ' roots': 5190,
          ' shake': 2577,
          ' clinging': 495,
          ' like': 386404,
          ' sheaves': 8,
          ' corn': 1936,
          ' gathers': 502,
          ' unto': 1190,
          ' threshes': 2,
          ' make': 166494,
          ' naked': 3078,
          ' sifts': 63,
          ' free': 36836,
          ' husks': 30,
          ' grinds': 153,
          ' whiteness': 111,
          ' kneads': 5,
          ' pliant': 38,
          ' assigns': 343,
          ' sacred': 4014,
          ' fire': 13932,
          ' may': 121525,
          ' become': 56985,
          ' bread': 6190,
          ' gods': 18428,
          ' feast': 1403,
          ' look': 73378,
          ' appear': 10981,
          ' simplistic': 3906,
          ' jaundiced': 143,
          ' eye': 14535,
          ' also': 267284,
          ' provide': 21321,
          ' inspiration': 5941,
          ' need': 79122,
          ' lifes': 3357,
          ' travails': 488,
          'prophet': 24,
          ' best': 118447,
          ' known': 32026,
          ' work': 159761,
          ' western': 17766,
          ' world': 183103,
          ' twenty-five': 6750,
          ' million': 9963,
          ' copies': 5054,
          ' sold': 6130,
          ' said': 62577,
          ' spent': 21141,
          ' twenty': 21761,
          ' held': 12773,
          ' onto': 7433,
          ' manuscript': 2788,
          ' four': 68224,
          ' finally': 42954,
          ' releasing': 830,
          ' publication': 5800,
          ' referred': 4221,
          ' strange': 15358,
          ' little': 148405,
          ' black': 31348,
          ' reference': 18498,
          ' working': 29410,
          ' counsels': 177,
          ' final': 25347,
          ' consisting': 832,
          ' twenty-six': 2100,
          ' verses': 2441,
          ' philosophy': 18910,
          ' east': 12340,
          ' west': 15204,
          ' meet': 18262,
          ' union': 9084,
          ' unparalleled': 536,
          ' literature': 20006,
          ' early': 44606,
          ' 20th': 7262,
          ' acknowledged': 1505,
          ' multitude': 1372,
          ' writers': 25652,
          ' evident': 4068,
          ' throughout': 39726,
          ' notably': 2289,
          ' visions': 2871,
          ' william': 9740,
          ' blake': 2842,
          ' poetprophet': 2,
          ' parexcellence': 4,
          ' incidentally': 1158,
          ' one': 639620,
          ' kahlils': 1,
          ' mentors': 582,
          ' sculptor': 226,
          ' rodin': 20,
          ' called': 32643,
          ' heavily': 5849,
          ' bible': 35360,
          ' buddhism': 3450,
          ' hinduism': 699,
          ' romantics': 203,
          ' ralph': 1288,
          ' waldo': 238,
          ' emerson': 1135,
          ' walt': 999,
          ' whitman': 484,
          ' friedrich': 272,
          ' nietzsche': 1714,
          ' ameen': 2,
          ' rihani': 1,
          ' christian': 39797,
          ' mysticism': 1504,
          ' published': 24787,
          ' critic': 2262,
          ' claude': 435,
          ' bragdon': 1,
          ' extraordinary': 6909,
          ' dramatic': 6103,
          ' power': 49571,
          ' deep': 24561,
          ' erudition': 658,
          ' lightninglike': 1,
          ' intuition': 1974,
          ' lyrical': 2087,
          ' lift': 1357,
          ' metrical': 20,
          ' mastery': 1810,
          ' message': 19232,
          ' presented': 19851,
          ' beauty': 14154,
          ' permeates': 647,
          ' entire': 33403,
          ' pattern': 6878,
          ' wow': 6258,
          ' thats': 63277,
          ' powerful': 24329,
          ' critique': 4316,
          ' lets': 15064,
          ' listen': 7983,
          ' close': 26063,
          ' friend': 37304,
          ' biographers': 721,
          ' mary': 14250,
          ' haskell': 154,
          ' received': 13357,
          ' copy': 24119,
          ' beloved': 6165,
          ' came': 43541,
          ' today': 35727,
          ' realize': 18377,
          ' hopes': 8333,
          ' compacted': 52,
          ' form': 27678,
          ' open': 25058,
          ' doors': 2656,
          ' desire': 13562,
          ' imagination': 9357,
          ' create': 22298,
          ' universe': 16756,
          ' nimbus': 24,
          ' treasures': 1099,
          ' darkness': 5908,
          ' find': 142140,
          ' heaven': 7713,
          ' generations': 5818,
          ' exhaust': 199,
          ' instead': 57080,
          ' generation': 10142,
          ' would': 355169,
          ' fain': 23,
          ' better': 116543,
          ' men': 58579,
          ' grow': 12150,
          ' riper': 18,
          ' loving': 11299,
          ' ever': 80573,
          ' add': 22471,
          ' good': 260350,
          ' point': 94274,
          'gibran': 2,
          ' khalil': 68,
          ' born': 13599,
          ' one thousand': 90898,
          ' eight hundred and eighty-three': 110,
          ' northern': 3796,
          ' nine hundred and nine': 168,
          ' went': 33696,
          ' paris': 7052,
          ' study': 28898,
          ' strict': 2486,
          ' education': 15666,
          ' traveled': 2124,
          ' eventually': 19153,
          ' moving': 17314,
          ' york': 17801,
          ' became': 28721,
          ' artist': 8031,
          ' writer': 43806,
          ' nine hundred and twenty-three': 256,
          ' generally': 13389,
          ' considered': 14084,
          ' greatest': 14928,
          ' died': 15257,
          ' cancer': 8619,
          ' hospital': 6138,
          ' young': 64388,
          ' age': 37449,
          ' forty-eight': 1177,
          ' story': 365799,
          ' almustafa': 5,
          ' living': 40748,
          ' twelve': 14218,
          ' orphalese': 3,
          ' depart': 335,
          ' aboard': 1080,
          ' ship': 7238,
          ' return': 17272,
          ' home': 54538,
          ' goes': 47571,
          ' group': 28546,
          ' people': 246448,
          ' stop': 22483,
          ' teaches': 6722,
          ' secrets': 12599,
          ' writes': 32669,
          ' poetic': 4100,
          ' manner': 14473,
          ' wonderful': 44476,
          'days': 141,
          ' gets': 42791,
          ' dismissed': 1983,
          ' hippie': 703,
          ' bestseller': 2214,
          ' long': 93686,
          ' 1960s': 3456,
          ' almost': 71656,
          ' instantly': 3618,
          ' hit': 13199,
          ' well': 247498,
          ' great': 193443,
          ' depression': 9076,
          ' claim': 13812,
          ' fame': 4012,
          ' third': 27316,
          ' bestselling': 2025,
          ' poet': 3311,
          ' behind': 31327,
          ' shakespeare': 4280,
          ' lao': 231,
          ' tzu': 404,
          ' pretty': 44103,
          ' entirely': 12133,
          ' based': 39153,
          ' sales': 5460,
          ' publisher': 8838,
          ' alfred': 1221,
          ' knopf': 200,
          ' asked': 11780,
          ' audience': 13187,
          ' flippantly': 64,
          ' question': 32085,
          ' cult': 3505,
          ' retorted': 27,
          ' whats': 17459,
          ' incredible': 11382,
          ' theres': 45481,
          ' absolutely': 22918,
          'marketing': 41,
          ' hype': 2796,
          ' success': 19908,
          ' gone': 18759,
          'political': 264,
          ' religious': 26425,
          ' commercial': 3151,
          ' enterprise': 2310,
          ' attached': 2942,
          ' name': 35366,
          ' bent': 2511,
          ' winning': 5164,
          ' souls': 5259,
          ' profits': 2334,
          ' estate': 5824,
          ' merely': 11912,
          ' licensing': 186,
          ' year': 50840,
          ' response': 8619,
          ' demand': 4267,
          ' fueled': 884,
          ' wordofmouth': 95,
          ' chance': 18855,
          ' discovery': 6938,
          ' fact': 90076,
          ' twentysix': 187,
          ' poems': 3351,
          ' surprising': 7045,
          ' suprassing': 1,
          ' relevance': 1914,
          ' insight': 18771,
          ' compassion': 5430,
          ' broken': 9832,
          ' several': 59339,
          ' topics': 14968,
          ' joy': 10679,
          ' sorrow': 1843,
          ' etc': 45945,
          ' recounts': 2561,
          ' sermons': 898,
          ' fictional': 7675,
          ' leaving': 12455,
          ' knowledge': 32032,
          ' leaves': 17493,
          ' homeland': 1765,
          ' found': 129617,
          ' setting': 19083,
          ' poem': 3052,
          ' children': 56034,
          ' local': 17859,
          ' washington': 9560,
          ' c': 18321,
          ' singers': 577,
          ' sweet': 13436,
          ' honey': 1502,
          ' rock': 7117,
          ' album': 1215,
          ' breaths': 195,
          ' sons': 6329,
          ' daughters': 6694,
          ' longing': 2122,
          ' come': 83552,
          ' though': 102839,
          ' belong': 2524,
          ' leave': 24599,
          ' college': 20031,
          ' eighteen': 6357,
          ' parents': 32025,
          ' roof': 1075,
          ' made': 114888,
          ' restless': 681,
          ' autonomy': 509,
          ' eloquently': 1208,
          ' expressed': 4155,
          ' everything': 62445,
          ' yearning': 1025,
          ' say': 111806,
          ' hours': 18005,
          ' frustration': 3706,
          ' adolescent': 1824,
          ' angst': 3332,
          ' later': 45655,
          ' proved': 4081,
          ' turn': 29043,
          ' needed': 25904,
          ' confidence': 4952,
          ' live': 43663,
          ' independent': 6945,
          ' fulfilling': 1671,
          ' still': 121749,
          ' maintaining': 2503,
          ' respect': 16022,
          ' towards': 18507,
          ' raised': 9845,
          ' understate': 269,
          ' grounded': 1943,
          ' sane': 1269,
          ' troubling': 1731,
          ' modern': 40270,
          ' lives': 57157,
          ' hectic': 364,
          ' stressful': 794,
          ' busy': 4840,
          ' wrought': 918,
          ' drama': 10406,
          ' brings': 18122,
          ' back': 121302,
          ' middle': 28798,
          ' ground': 12214,
          ' sage': 1216,
          ' clarity': 5027,
          ' helpful': 22016,
          ' unwinding': 134,
          ' coming': 28159,
          ' bring': 22670,
          ' whcih': 23,
          ' wider': 1979,
          ' despite': 30524,
          ' conceit': 644,
          ' really': 206341,
          ' applicable': 1781,
          ' atheist': 3457,
          ' poetry': 6703,
          ' mostly': 21684,
          ' imagistic': 18,
          ' expect': 20306,
          ' academic': 9406,
          ' poetics': 79,
          ' contemporaries': 1272,
          ' eliot': 1015,
          ' pound': 1632,
          ' frost': 1059,
          ' aim': 2298,
          ' accessible': 6915,
          ' immediate': 4199,
          ' reader': 112318,
          ' rely': 3419,
          ' clear': 44452,
          ' metaphors': 2452,
          ' vivid': 8303,
          ' imagery': 3471,
          ' hard': 67430,
          ' perhaps': 52616,
          ' check': 14321,
          ' table': 9879,
          ' contexts': 712,
          ' using': 40425,
          ' amazons': 1197,
          ' search': 14790,
          ' inside': 18465,
          ' feature': 4830,
          ' bookstore': 4368,
          ' addresses': 5144,
          ' problem': 48962,
          ' issue': 22876,
          ' dealing': 12805,
          ' start': 49751,
          ' chances': 2891,
          ' something': 111135,
          ' speaks': 7548,
          ' level': 31632,
          'book': 40141,
          ' took': 37653,
          ' pains': 1897,
          ' joys': 1395,
          ' millions': 7657,
          ' somehow': 13960,
          ' managed': 8714,
          ' encapsulate': 168,
          ' wonders': 4739,
          ' unique': 22512,
          ' explored': 3736,
          ' every': 110691,
          ' facet': 802,
          ' human': 67138,
          ' existence': 12149,
          ' although': 60457,
          ' charm': 3844,
          ' universal': 5139,
          ' appeal': 8259,
          ' allow': 11653,
          ' persuassion': 4,
          ' enjoy': 46710,
          ' gem': 3343,
          ' wouldnt': 20995,
          ' surprised': 15362,
          ' hundred': 10159,
          ' theologians': 1305,
          ' unearth': 217,
          ' conclude': 2650,
          'certainly': 315,
          ' words': 53707,
          ' eternally': 522,
          ' present': 29385,
          ' divine': 6226,
          ' told': 40137,
          ' source': 14365,
          ' sources': 15244,
          ' ready': 12194,
          ' receive': 4801,
          ' destined': 2260,
          ' know': 143387,
          ' meaning': 17835,
          ' meanings': 2400,
          ' five': 74282,
          ' valued': 939,
          ' keep': 61844,
          ' refering': 132,
          ' solace': 863,
          ' peace': 13994,
          ' rather': 79364,
          ' instrument': 1092,
          ' write': 45977,
          ' reached': 5150,
          ' original': 30366,
          ' available': 18320,
          ' humankind': 996,
          ' music': 15776,
          ' truth': 41242,
          ' salute': 433,
          ' us': 173033,
          ' dispenses': 201,
          ' ultimate': 9136,
          ' ones': 39616,
          ' bids': 142,
          ' fare': 1788,
          ' defines': 2249,
          ' never': 154708,
          ' define': 3509,
          ' appropriately': 1550,
          ' sense': 62476,
          ' discovered': 11310,
          ' refused': 3266,
          ' rush': 4242,
          ' lesson': 7490,
          ' understanding': 37273,
          ' ability': 22754,
          ' way': 215636,
          ' everyday': 7854,
          ' dispense': 262,
          'ending': 681,
          ' treasure': 4861,
          ' myth': 7685,
          ' whose': 28622,
          ' word': 39285,
          ' drips': 184,
          ' thought': 86511,
          ' swimming': 1180,
          ' realitybut': 4,
          ' reality': 27646,
          ' till': 3788,
          ' knew': 31335,
          ' shared': 6714,
          ' bit': 75203,
          ' wonder': 20826,
          ' simplicity': 2911,
          ' preaching': 2389,
          ' able': 50094,
          ' feel': 82118,
          ' gut': 2008,
          ' characters': 208455,
          ' unsurpassed': 265,
          ' speaking': 9772,
          ' ways': 38905,
          'known': 896,
          ' weakest': 1386,
          ' prayer': 7001,
          ' certainly': 41800,
          ' relies': 2441,
          ' jesus': 37990,
          ' allusions': 1040,
          ' wordsand': 15,
          ' actions': 19352,
          ' wineskin': 10,
          ' right': 95314,
          ' bedrock': 302,
          ' obviously': 17754,
          ' enlightened': 2330,
          ' completely': 34990,
          ' psychologically': 1111,
          ' spiritually': 1680,
          ' healthy': 12481,
          ' consider': 19483,
          ' standard': 13168,
          ' simply': 55025,
          ' simpleminded': 253,
          ' high': 49216,
          ' school': 43008,
          ' students': 18490,
          ' gaining': 1886,
          ' ten': 35773,
          ' typically': 3572,
          ' remind': 4110,
          ' however': 113633,
          ' gave': 28525,
          ' son': 29157,
          ' showed': 6833,
          ' signs': 4165,
          ' risk': 10243,
          ' seventeen': 5481,
          ' significant': 11129,
          ' positive': 18157,
          ' impact': 12259,
          ' responsible': 7794,
          ' balanced': 6230,
          ' telephone': 984,
          ' evening': 3882,
          ' passages': 9362,
          ' think': 153896,
          ' get': 203875,
          ' teenager': 4283,
          ' thing': 77277,
          'kahlil': 1,
          ' eighteen million': 894,
          ' eight hundred and thirty-one thousand': 32,
          ' nine hundred and thirty-one': 292,
          ' lebaneseamerican': 6,
          ' mystic': 1189,
          'wrote': 321,
          ' anything': 60671,
          ' finer': 711,
          ' volume': 20528,
          ' masterwork': 368,
          ' nothing': 71359,
          ' beyond': 29373,
          ' remembered': 3220,
          ' perpetuity': 52,
          ' brief': 13619,
          ' chapter': 76724,
          ' aspect': 12907,
          ' condition': 6247,
          ' including': 38056,
          ' marriage': 25393,
          ' pleasure': 10698,
          ' buying': 13278,
          ' selling': 6690,
          ' eating': 14794,
          ' drinking': 4903,
          ' espouses': 370,
          'particular': 668,
          ' ethical': 3198,
          ' moral': 15820,
          ' system': 33247,
          ' includes': 17331,
          ' slim': 2160,
          ' tome': 3488,
          ' seems': 83776,
          ' quill': 146,
          ' light': 33388,
          ' ink': 1620,
          ' higher': 11493,
          ' whatever': 17324,
          ' call': 25326,
          ' spoke': 3822,
          ' perfect': 31956,
          ' letter': 7452,
          ' ages': 8063,
          ' expression': 3698,
          ' humanity': 10541,
          'father': 210,
          ' huge': 20952,
          ' remember': 20796,
          ' around': 74445,
          ' amidst': 1287,
          ' sea': 9536,
          ' small': 43935,
          ' sit': 7001,
          ' changed': 18328,
          ' talks': 11414,
          ' friendship': 8274,
          ' death': 53006,
          ' answers': 14887,
          ' spend': 15495,
          ' searching': 5804,
          ' tells': 31320,
          ' things': 130509,
          ' already': 48218,
          ' heads': 5675,
          ' dont': 178664,
          ' ignoring': 2832,
          ' default': 1555,
          'place': 959,
          ' go': 97212,
          ' doesnt': 96338,
          'follow': 1100,
          ' teacher': 11606,
          ' despised': 895,
          ' talking': 17307,
          ' favorite': 33007,
          ' got': 64427,
          ' excited': 8048,
          ' thinking': 42289,
          ' common': 27545,
          ' someone': 59821,
          ' despise': 991,
          ' guess': 17668,
          'forget': 2503,
          ' waited': 2678,
          ' coastal': 554,
          ' town': 22080,
          ' bear': 6647,
          ' misses': 2454,
          ' waiting': 14088,
          ' knows': 33424,
          ' mystery': 39976,
          ' departs': 322,
          ' townspeople': 534,
          ' gather': 2072,
          ' wish': 28775,
          ' seeress': 20,
          ' asks': 7300,
          ' share': 16983,
          ' endure': 2545,
          ' reveals': 7657,
          ' pain': 16571,
          ' profound': 7812,
          ' advice': 26285,
          ' judge': 6905,
          ' together': 59216,
          ' forevermore': 56,
          ' white': 29848,
          ' wings': 1952,
          ' scatter': 141,
          ' days': 39957,
          ' aye': 71,
          ' silent': 3269,
          ' memory': 11137,
          ' god': 77025,
          ' let': 38137,
          ' spaces': 1242,
          ' togetherness': 128,
          ' winds': 1838,
          ' heavens': 1280,
          ' dance': 3634,
          ' another': 113957,
          ' bond': 5871,
          ' shores': 647,
          ' fill': 6672,
          ' others': 79338,
          ' cup': 3637,
          ' drink': 4667,
          ' give': 83120,
          ' eat': 17151,
          ' loaf': 629,
          ' sing': 1608,
          ' joyous': 432,
          ' alone': 28148,
          ' strings': 1438,
          ' lute': 52,
          ' hearts': 5487,
          ' keeping': 12013,
          ' hand': 32928,
          ' stand': 16953,
          ' near': 15817,
          ' pillars': 862,
          ' temple': 3022,
          ' apart': 12487,
          ' oak': 481,
          ' tree': 5364,
          ' cypress': 67,
          ' shadow': 5529,
          ' similar': 22730,
          ' thetao': 4,
          ' te': 453,
          ' ching': 621,
          ' perennial': 513,
          ' classics': 4022,
          ' border': 2805,
          ' guard': 3452,
          ' recognises': 115,
          ' exile': 1652,
          ' five hundred': 5794,
          ' translated': 3484,
          ' tao': 1181,
          ' contains': 14736,
          ' principles': 11002,
          ' use': 82742,
          ' youre': 47915,
          ' forget': 13048,
          ' ego': 3799,
          ' opens': 6833,
          ' recommend': 72905,
          ' stephen': 7900,
          ' mitchell': 1825,
          ' hope': 43359,
          ' useful': 25187,
          ' menu': 1473,
          ' lebanese': 256,
          ' restaurant': 4018,
          ' sought': 3652,
          ' fully': 15178,
          ' parts': 29932,
          ' barely': 8783,
          ' touch': 11434,
          ' onlove': 1,
          ' insecurities': 1259,
          ' doubts': 3318,
          ' going': 96882,
          ' rough': 4373,
          ' unanswered': 1984,
          ' questions': 33958,
          ' certain': 30623,
          ' lucid': 2129,
          ' insightful': 8009,
          ' foodi': 2,
          ' thankfully': 3102,
          ' given': 54338,
          ' journeyed': 168,
          ' overseas': 1274,
          ' spiritual': 21026,
          ' quest': 8212,
          'got': 3843,
          ' month': 7841,
          ' lost': 40945,
          ' job': 57801,
          ' relationship': 45996,
          ' unsteady': 71,
          ' felt': 59471,
          ' isolated': 3057,
          ' skipping': 2552,
          ' sections': 14003,
          ' immediately': 16292,
          ' pertained': 79,
          ' cried': 2788,
          ' safe': 9035,
          ' sadness': 2896,
          ' enlightenment': 3943,
          ' experiences': 19777,
          ' thoughts': 20840,
          ' detailed': 19873,
          ' accounts': 9831,
          ' applies': 2914,
          ' everyone': 45871,
          ' writings': 8782,
          ' layers': 3174,
          ' therefore': 13663,
          ' breaking': 5331,
          ' shell': 3811,
          ' encloses': 21,
          'atheist': 46,
          ' seem': 42743,
          ' holds': 7370,
          ' references': 15229,
          ' walks': 4105,
          ' relate': 8936,
          ' prose': 19488,
          ' greatly': 7090,
          ' celebrated': 1474,
          ' countries': 11443,
          ' simple': 38385,
          ' biting': 1369,
          ' phrases': 4413,
          ' sentences': 8528,
          ' legacy': 4322,
          ' thoughtaltering': 1,
          ' quote': 8122,
          ' uses': 23670,
          ' short': 55582,
          ...})

In [110]:
words = []
freqs = []

In [111]:
for rank, word in enumerate(fd):
    words.append(word)
    freqs.append(fd[word])

In [112]:
frequencies = {'word': words, 'frequency':freqs}
frequencies_df = pd.DataFrame(frequencies)

In [113]:
frequencies_df.head()


Out[113]:
frequency word
0 26 timeless
1 17851 classic
2 2481 demanding
3 2899 assuming
4 31208 title

In [114]:
frequencies_df = frequencies_df.sort_values(['frequency'], ascending=[False])
frequencies_df = frequencies_df.reset_index()
frequencies_df = frequencies_df.drop(columns=['index'])

In [115]:
frequencies_df[0:20]


Out[115]:
frequency word
0 1502803 book
1 639620 one
2 467228 read
3 386404 like
4 365799 story
5 355169 would
6 287112 no
7 286763 time
8 273330 many
9 267441 much
10 267284 also
11 262122 books
12 260350 good
13 259519 life
14 247498 well
15 246448 people
16 245126 first
17 244133 even
18 235220 quot
19 222104 two

In [116]:
import plotly 
import plotly.plotly as py
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
print(cf.__version__)
# Configure cufflings 
cf.set_config_file(offline=False, world_readable=True, theme='pearl')


0.12.1

In [117]:
plotly.tools.set_credentials_file(username='falrashidi', api_key='XaO64TRYU0N3Sdup8Z3H')

In [118]:
frequencies_df['frequency'][0:75].iplot(kind='bar', xTitle='Words', yTitle='Frequency', title='Occurences in the Corpus per Word (Zipf\'s Law)')


Out[118]:

In [119]:
frequencies_df.to_csv("../data/interim/003_dictionary.csv", sep='\t', header=True, index=False);

In [128]:
df = frequencies_df.reindex_axis(sorted(frequencies_df.columns, reverse=True), axis=1)


/Users/falehalrashidi/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:1: FutureWarning:

'.reindex_axis' is deprecated and will be removed in a future version. Use '.reindex' instead.


In [131]:
sorted(frequencies_df.columns, reverse=True)


Out[131]:
['word', 'frequency']

In [133]:
final_df = frequencies_df.reindex(['word', 'frequency'], axis=1)

In [136]:
# Save a dictionary into a pickle file.
final_df.to_pickle("../data/interim/003_dictionary.p")

In [ ]:
# END_OF_FILE