In [99]:
# For monitoring duration of pandas processes
from tqdm import tqdm, tqdm_pandas
# To avoid RuntimeError: Set changed size during iteration
tqdm.monitor_interval = 0
# Register `pandas.progress_apply` and `pandas.Series.map_apply` with `tqdm`
# (can use `tqdm_gui`, `tqdm_notebook`, optional kwargs, etc.)
tqdm.pandas(desc="Progress:")
# Now you can use `progress_apply` instead of `apply`
# and `progress_map` instead of `map`
# can also groupby:
# df.groupby(0).progress_apply(lambda x: x**2)
In [100]:
import pandas as pd
df0 = pd.read_csv("../data/interim/001_normalised_keyed_reviews.csv", sep="\t", low_memory=False)
df0.head()
Out[100]:
uniqueKey
reviewText
0
A2XQ5LZHTD4AFT##000100039X
['timeless', 'classic', 'demanding', 'assuming...
1
AF7CSSGV93RXN##000100039X
['first', 'read', 'prophet', 'kahlil', 'gibran...
2
A1NPNGWBVD9AK3##000100039X
['one', 'first', 'literary', 'books', 'recall'...
3
A3IS4WGMFR4X65##000100039X
['prophet', 'kahlil', 'gibrans', 'best', 'know...
4
AWLFVCT9128JV##000100039X
['gibran', 'khalil', 'gibran', 'born', 'one th...
In [101]:
def convert_text_to_list(review):
return review.replace("[","").replace("]","").replace("'","").replace("\t","").split(",")
In [102]:
# Convert "reviewText" field to back to list
df0['reviewText'] = df0['reviewText'].astype(str)
df0['reviewText'] = df0['reviewText'].progress_apply(lambda text: convert_text_to_list(text));
df0['reviewText'].head()
Progress:: 100%|██████████| 582711/582711 [00:13<00:00, 42892.56it/s]
Out[102]:
0 [timeless, classic, demanding, assuming, t...
1 [first, read, prophet, kahlil, gibran, th...
2 [one, first, literary, books, recall, rea...
3 [prophet, kahlil, gibrans, best, known, w...
4 [gibran, khalil, gibran, born, one thousan...
Name: reviewText, dtype: object
In [103]:
# Split negs
def split_neg(review):
new_review = []
for token in review:
if '_' in token:
split_words = token.split("_")
new_review.append(split_words[0])
new_review.append(split_words[1])
else:
new_review.append(token)
return new_review
In [104]:
df0["reviewText"] = df0["reviewText"].progress_apply(lambda review: split_neg(review))
df0["reviewText"].head()
Progress:: 100%|██████████| 582711/582711 [00:09<00:00, 62759.72it/s]
Out[104]:
0 [timeless, classic, demanding, assuming, t...
1 [first, read, prophet, kahlil, gibran, th...
2 [one, first, literary, books, recall, rea...
3 [prophet, kahlil, gibrans, best, known, w...
4 [gibran, khalil, gibran, born, one thousan...
Name: reviewText, dtype: object
In [105]:
### Remove Stop Words
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
def remove_stopwords(review):
return [token for token in review if not token in stop_words]
In [106]:
df0["reviewText"] = df0["reviewText"].progress_apply(lambda review: remove_stopwords(review))
df0["reviewText"].head()
Progress:: 100%|██████████| 582711/582711 [00:55<00:00, 10507.46it/s]
Out[106]:
0 [timeless, classic, demanding, assuming, t...
1 [first, read, prophet, kahlil, gibran, th...
2 [one, first, literary, books, recall, rea...
3 [prophet, kahlil, gibrans, best, known, w...
4 [gibran, khalil, gibran, born, one thousan...
Name: reviewText, dtype: object
In [107]:
import nltk
from nltk.probability import FreqDist
def collect_zipfs_law_metrics(review, fd):
for token in review:
fd.update([token])
In [108]:
fd = FreqDist()
df0['reviewText'].progress_apply(lambda review: collect_zipfs_law_metrics(review, fd));
Progress:: 100%|██████████| 582711/582711 [03:28<00:00, 2789.07it/s]
In [109]:
fd
Out[109]:
FreqDist({'timeless': 26,
' classic': 17851,
' demanding': 2481,
' assuming': 2899,
' title': 31208,
' gibran': 63,
' backs': 1822,
' excellent': 48518,
' style': 53946,
' content': 17043,
' means': 30436,
' publish': 2811,
' century': 34565,
' two': 222104,
' earlier': 20668,
' could': 186395,
' inspired': 7735,
' new': 191404,
' religion': 26344,
' mouth': 5174,
' old': 76688,
' man': 109999,
' sail': 783,
' away': 69473,
' far': 73068,
' destination': 1456,
' hear': 16369,
' wisdom': 13007,
' life': 259519,
' important': 60359,
' aspects': 16828,
' messege': 7,
' guide': 23941,
' book': 1502803,
' sufi': 247,
' sermon': 1260,
' much': 267441,
' put': 72396,
' perspective': 25132,
' without': 96163,
' hint': 3692,
' dogma': 1783,
' hints': 4314,
' birth': 10395,
' place': 68095,
' lebanon': 757,
' many': 273330,
' prophets': 1303,
' walked': 3332,
' earth': 23642,
' project': 11050,
' first': 245126,
' germinated': 18,
' likely': 21698,
' probably': 55570,
' becuase': 360,
' written': 124075,
' english': 26152,
' originally': 6357,
' writing': 124357,
' flows': 3164,
' pleasant': 4298,
' read': 467228,
' charcoal': 199,
' drawings': 3051,
' author': 213589,
' decorating': 850,
' pages': 90166,
' plus': 13356,
' loved': 53512,
' cover': 33005,
'first': 13963,
' prophet': 2472,
' kahlil': 47,
' thirty': 16089,
' years': 154925,
' ago': 34136,
' times': 90304,
' since': 82823,
' gibrans': 30,
' messages': 4377,
' timeless': 2536,
' always': 73650,
' influenced': 4131,
' relationships': 18876,
' used': 60893,
' gift': 14420,
' persons': 6685,
' cared': 3425,
' paperback': 5165,
' version': 26043,
' lovely': 7584,
' makes': 102666,
' nice': 28923,
' yet': 81611,
' inexpensive': 1077,
' containing': 2024,
' valuable': 12275,
' lessons': 10936,
' anyone': 72183,
' purchased': 7487,
' three': 116777,
' future': 41106,
' giving': 26821,
' special': 19816,
' price': 19446,
' offered': 7517,
'one': 29098,
' literary': 18325,
' books': 262122,
' recall': 3712,
' reading': 201307,
' mother': 45682,
' kept': 29236,
' collection': 19998,
' works': 51471,
' often': 68942,
' curious': 7600,
' see': 139124,
' attracted': 4663,
' looked': 11790,
' either': 43383,
' eight': 18633,
' nine': 20239,
' time': 286763,
' believe': 60676,
' taste': 10443,
' spirituality': 4324,
' seemed': 35045,
' relevant': 8297,
' forcefed': 69,
' nuns': 944,
' catechism': 598,
' class': 22212,
' rereading': 2992,
' im': 116336,
' struck': 5121,
' notion': 6523,
' hesse': 312,
' must': 84104,
' aware': 13270,
' texts': 6630,
' wrote': 27088,
' siddhartha': 219,
' contain': 5750,
' themes': 10700,
' no': 287112,
' else': 35728,
' path': 13354,
' select': 2125,
' course': 59938,
' preachers': 724,
' dime': 778,
' dozen': 4810,
' true': 72328,
' comes': 55815,
' within': 40910,
' teaching': 12686,
' love': 190146,
' particularly': 28809,
' stage': 9044,
' quot': 235220,
' even': 244133,
' crowns': 96,
' shall': 4036,
' crucify': 93,
' growth': 10497,
' pruning': 165,
' ascends': 79,
' height': 1588,
' caresses': 47,
' tenderest': 7,
' branches': 1188,
' quiver': 105,
' sun': 6416,
' descend': 494,
' roots': 5190,
' shake': 2577,
' clinging': 495,
' like': 386404,
' sheaves': 8,
' corn': 1936,
' gathers': 502,
' unto': 1190,
' threshes': 2,
' make': 166494,
' naked': 3078,
' sifts': 63,
' free': 36836,
' husks': 30,
' grinds': 153,
' whiteness': 111,
' kneads': 5,
' pliant': 38,
' assigns': 343,
' sacred': 4014,
' fire': 13932,
' may': 121525,
' become': 56985,
' bread': 6190,
' gods': 18428,
' feast': 1403,
' look': 73378,
' appear': 10981,
' simplistic': 3906,
' jaundiced': 143,
' eye': 14535,
' also': 267284,
' provide': 21321,
' inspiration': 5941,
' need': 79122,
' lifes': 3357,
' travails': 488,
'prophet': 24,
' best': 118447,
' known': 32026,
' work': 159761,
' western': 17766,
' world': 183103,
' twenty-five': 6750,
' million': 9963,
' copies': 5054,
' sold': 6130,
' said': 62577,
' spent': 21141,
' twenty': 21761,
' held': 12773,
' onto': 7433,
' manuscript': 2788,
' four': 68224,
' finally': 42954,
' releasing': 830,
' publication': 5800,
' referred': 4221,
' strange': 15358,
' little': 148405,
' black': 31348,
' reference': 18498,
' working': 29410,
' counsels': 177,
' final': 25347,
' consisting': 832,
' twenty-six': 2100,
' verses': 2441,
' philosophy': 18910,
' east': 12340,
' west': 15204,
' meet': 18262,
' union': 9084,
' unparalleled': 536,
' literature': 20006,
' early': 44606,
' 20th': 7262,
' acknowledged': 1505,
' multitude': 1372,
' writers': 25652,
' evident': 4068,
' throughout': 39726,
' notably': 2289,
' visions': 2871,
' william': 9740,
' blake': 2842,
' poetprophet': 2,
' parexcellence': 4,
' incidentally': 1158,
' one': 639620,
' kahlils': 1,
' mentors': 582,
' sculptor': 226,
' rodin': 20,
' called': 32643,
' heavily': 5849,
' bible': 35360,
' buddhism': 3450,
' hinduism': 699,
' romantics': 203,
' ralph': 1288,
' waldo': 238,
' emerson': 1135,
' walt': 999,
' whitman': 484,
' friedrich': 272,
' nietzsche': 1714,
' ameen': 2,
' rihani': 1,
' christian': 39797,
' mysticism': 1504,
' published': 24787,
' critic': 2262,
' claude': 435,
' bragdon': 1,
' extraordinary': 6909,
' dramatic': 6103,
' power': 49571,
' deep': 24561,
' erudition': 658,
' lightninglike': 1,
' intuition': 1974,
' lyrical': 2087,
' lift': 1357,
' metrical': 20,
' mastery': 1810,
' message': 19232,
' presented': 19851,
' beauty': 14154,
' permeates': 647,
' entire': 33403,
' pattern': 6878,
' wow': 6258,
' thats': 63277,
' powerful': 24329,
' critique': 4316,
' lets': 15064,
' listen': 7983,
' close': 26063,
' friend': 37304,
' biographers': 721,
' mary': 14250,
' haskell': 154,
' received': 13357,
' copy': 24119,
' beloved': 6165,
' came': 43541,
' today': 35727,
' realize': 18377,
' hopes': 8333,
' compacted': 52,
' form': 27678,
' open': 25058,
' doors': 2656,
' desire': 13562,
' imagination': 9357,
' create': 22298,
' universe': 16756,
' nimbus': 24,
' treasures': 1099,
' darkness': 5908,
' find': 142140,
' heaven': 7713,
' generations': 5818,
' exhaust': 199,
' instead': 57080,
' generation': 10142,
' would': 355169,
' fain': 23,
' better': 116543,
' men': 58579,
' grow': 12150,
' riper': 18,
' loving': 11299,
' ever': 80573,
' add': 22471,
' good': 260350,
' point': 94274,
'gibran': 2,
' khalil': 68,
' born': 13599,
' one thousand': 90898,
' eight hundred and eighty-three': 110,
' northern': 3796,
' nine hundred and nine': 168,
' went': 33696,
' paris': 7052,
' study': 28898,
' strict': 2486,
' education': 15666,
' traveled': 2124,
' eventually': 19153,
' moving': 17314,
' york': 17801,
' became': 28721,
' artist': 8031,
' writer': 43806,
' nine hundred and twenty-three': 256,
' generally': 13389,
' considered': 14084,
' greatest': 14928,
' died': 15257,
' cancer': 8619,
' hospital': 6138,
' young': 64388,
' age': 37449,
' forty-eight': 1177,
' story': 365799,
' almustafa': 5,
' living': 40748,
' twelve': 14218,
' orphalese': 3,
' depart': 335,
' aboard': 1080,
' ship': 7238,
' return': 17272,
' home': 54538,
' goes': 47571,
' group': 28546,
' people': 246448,
' stop': 22483,
' teaches': 6722,
' secrets': 12599,
' writes': 32669,
' poetic': 4100,
' manner': 14473,
' wonderful': 44476,
'days': 141,
' gets': 42791,
' dismissed': 1983,
' hippie': 703,
' bestseller': 2214,
' long': 93686,
' 1960s': 3456,
' almost': 71656,
' instantly': 3618,
' hit': 13199,
' well': 247498,
' great': 193443,
' depression': 9076,
' claim': 13812,
' fame': 4012,
' third': 27316,
' bestselling': 2025,
' poet': 3311,
' behind': 31327,
' shakespeare': 4280,
' lao': 231,
' tzu': 404,
' pretty': 44103,
' entirely': 12133,
' based': 39153,
' sales': 5460,
' publisher': 8838,
' alfred': 1221,
' knopf': 200,
' asked': 11780,
' audience': 13187,
' flippantly': 64,
' question': 32085,
' cult': 3505,
' retorted': 27,
' whats': 17459,
' incredible': 11382,
' theres': 45481,
' absolutely': 22918,
'marketing': 41,
' hype': 2796,
' success': 19908,
' gone': 18759,
'political': 264,
' religious': 26425,
' commercial': 3151,
' enterprise': 2310,
' attached': 2942,
' name': 35366,
' bent': 2511,
' winning': 5164,
' souls': 5259,
' profits': 2334,
' estate': 5824,
' merely': 11912,
' licensing': 186,
' year': 50840,
' response': 8619,
' demand': 4267,
' fueled': 884,
' wordofmouth': 95,
' chance': 18855,
' discovery': 6938,
' fact': 90076,
' twentysix': 187,
' poems': 3351,
' surprising': 7045,
' suprassing': 1,
' relevance': 1914,
' insight': 18771,
' compassion': 5430,
' broken': 9832,
' several': 59339,
' topics': 14968,
' joy': 10679,
' sorrow': 1843,
' etc': 45945,
' recounts': 2561,
' sermons': 898,
' fictional': 7675,
' leaving': 12455,
' knowledge': 32032,
' leaves': 17493,
' homeland': 1765,
' found': 129617,
' setting': 19083,
' poem': 3052,
' children': 56034,
' local': 17859,
' washington': 9560,
' c': 18321,
' singers': 577,
' sweet': 13436,
' honey': 1502,
' rock': 7117,
' album': 1215,
' breaths': 195,
' sons': 6329,
' daughters': 6694,
' longing': 2122,
' come': 83552,
' though': 102839,
' belong': 2524,
' leave': 24599,
' college': 20031,
' eighteen': 6357,
' parents': 32025,
' roof': 1075,
' made': 114888,
' restless': 681,
' autonomy': 509,
' eloquently': 1208,
' expressed': 4155,
' everything': 62445,
' yearning': 1025,
' say': 111806,
' hours': 18005,
' frustration': 3706,
' adolescent': 1824,
' angst': 3332,
' later': 45655,
' proved': 4081,
' turn': 29043,
' needed': 25904,
' confidence': 4952,
' live': 43663,
' independent': 6945,
' fulfilling': 1671,
' still': 121749,
' maintaining': 2503,
' respect': 16022,
' towards': 18507,
' raised': 9845,
' understate': 269,
' grounded': 1943,
' sane': 1269,
' troubling': 1731,
' modern': 40270,
' lives': 57157,
' hectic': 364,
' stressful': 794,
' busy': 4840,
' wrought': 918,
' drama': 10406,
' brings': 18122,
' back': 121302,
' middle': 28798,
' ground': 12214,
' sage': 1216,
' clarity': 5027,
' helpful': 22016,
' unwinding': 134,
' coming': 28159,
' bring': 22670,
' whcih': 23,
' wider': 1979,
' despite': 30524,
' conceit': 644,
' really': 206341,
' applicable': 1781,
' atheist': 3457,
' poetry': 6703,
' mostly': 21684,
' imagistic': 18,
' expect': 20306,
' academic': 9406,
' poetics': 79,
' contemporaries': 1272,
' eliot': 1015,
' pound': 1632,
' frost': 1059,
' aim': 2298,
' accessible': 6915,
' immediate': 4199,
' reader': 112318,
' rely': 3419,
' clear': 44452,
' metaphors': 2452,
' vivid': 8303,
' imagery': 3471,
' hard': 67430,
' perhaps': 52616,
' check': 14321,
' table': 9879,
' contexts': 712,
' using': 40425,
' amazons': 1197,
' search': 14790,
' inside': 18465,
' feature': 4830,
' bookstore': 4368,
' addresses': 5144,
' problem': 48962,
' issue': 22876,
' dealing': 12805,
' start': 49751,
' chances': 2891,
' something': 111135,
' speaks': 7548,
' level': 31632,
'book': 40141,
' took': 37653,
' pains': 1897,
' joys': 1395,
' millions': 7657,
' somehow': 13960,
' managed': 8714,
' encapsulate': 168,
' wonders': 4739,
' unique': 22512,
' explored': 3736,
' every': 110691,
' facet': 802,
' human': 67138,
' existence': 12149,
' although': 60457,
' charm': 3844,
' universal': 5139,
' appeal': 8259,
' allow': 11653,
' persuassion': 4,
' enjoy': 46710,
' gem': 3343,
' wouldnt': 20995,
' surprised': 15362,
' hundred': 10159,
' theologians': 1305,
' unearth': 217,
' conclude': 2650,
'certainly': 315,
' words': 53707,
' eternally': 522,
' present': 29385,
' divine': 6226,
' told': 40137,
' source': 14365,
' sources': 15244,
' ready': 12194,
' receive': 4801,
' destined': 2260,
' know': 143387,
' meaning': 17835,
' meanings': 2400,
' five': 74282,
' valued': 939,
' keep': 61844,
' refering': 132,
' solace': 863,
' peace': 13994,
' rather': 79364,
' instrument': 1092,
' write': 45977,
' reached': 5150,
' original': 30366,
' available': 18320,
' humankind': 996,
' music': 15776,
' truth': 41242,
' salute': 433,
' us': 173033,
' dispenses': 201,
' ultimate': 9136,
' ones': 39616,
' bids': 142,
' fare': 1788,
' defines': 2249,
' never': 154708,
' define': 3509,
' appropriately': 1550,
' sense': 62476,
' discovered': 11310,
' refused': 3266,
' rush': 4242,
' lesson': 7490,
' understanding': 37273,
' ability': 22754,
' way': 215636,
' everyday': 7854,
' dispense': 262,
'ending': 681,
' treasure': 4861,
' myth': 7685,
' whose': 28622,
' word': 39285,
' drips': 184,
' thought': 86511,
' swimming': 1180,
' realitybut': 4,
' reality': 27646,
' till': 3788,
' knew': 31335,
' shared': 6714,
' bit': 75203,
' wonder': 20826,
' simplicity': 2911,
' preaching': 2389,
' able': 50094,
' feel': 82118,
' gut': 2008,
' characters': 208455,
' unsurpassed': 265,
' speaking': 9772,
' ways': 38905,
'known': 896,
' weakest': 1386,
' prayer': 7001,
' certainly': 41800,
' relies': 2441,
' jesus': 37990,
' allusions': 1040,
' wordsand': 15,
' actions': 19352,
' wineskin': 10,
' right': 95314,
' bedrock': 302,
' obviously': 17754,
' enlightened': 2330,
' completely': 34990,
' psychologically': 1111,
' spiritually': 1680,
' healthy': 12481,
' consider': 19483,
' standard': 13168,
' simply': 55025,
' simpleminded': 253,
' high': 49216,
' school': 43008,
' students': 18490,
' gaining': 1886,
' ten': 35773,
' typically': 3572,
' remind': 4110,
' however': 113633,
' gave': 28525,
' son': 29157,
' showed': 6833,
' signs': 4165,
' risk': 10243,
' seventeen': 5481,
' significant': 11129,
' positive': 18157,
' impact': 12259,
' responsible': 7794,
' balanced': 6230,
' telephone': 984,
' evening': 3882,
' passages': 9362,
' think': 153896,
' get': 203875,
' teenager': 4283,
' thing': 77277,
'kahlil': 1,
' eighteen million': 894,
' eight hundred and thirty-one thousand': 32,
' nine hundred and thirty-one': 292,
' lebaneseamerican': 6,
' mystic': 1189,
'wrote': 321,
' anything': 60671,
' finer': 711,
' volume': 20528,
' masterwork': 368,
' nothing': 71359,
' beyond': 29373,
' remembered': 3220,
' perpetuity': 52,
' brief': 13619,
' chapter': 76724,
' aspect': 12907,
' condition': 6247,
' including': 38056,
' marriage': 25393,
' pleasure': 10698,
' buying': 13278,
' selling': 6690,
' eating': 14794,
' drinking': 4903,
' espouses': 370,
'particular': 668,
' ethical': 3198,
' moral': 15820,
' system': 33247,
' includes': 17331,
' slim': 2160,
' tome': 3488,
' seems': 83776,
' quill': 146,
' light': 33388,
' ink': 1620,
' higher': 11493,
' whatever': 17324,
' call': 25326,
' spoke': 3822,
' perfect': 31956,
' letter': 7452,
' ages': 8063,
' expression': 3698,
' humanity': 10541,
'father': 210,
' huge': 20952,
' remember': 20796,
' around': 74445,
' amidst': 1287,
' sea': 9536,
' small': 43935,
' sit': 7001,
' changed': 18328,
' talks': 11414,
' friendship': 8274,
' death': 53006,
' answers': 14887,
' spend': 15495,
' searching': 5804,
' tells': 31320,
' things': 130509,
' already': 48218,
' heads': 5675,
' dont': 178664,
' ignoring': 2832,
' default': 1555,
'place': 959,
' go': 97212,
' doesnt': 96338,
'follow': 1100,
' teacher': 11606,
' despised': 895,
' talking': 17307,
' favorite': 33007,
' got': 64427,
' excited': 8048,
' thinking': 42289,
' common': 27545,
' someone': 59821,
' despise': 991,
' guess': 17668,
'forget': 2503,
' waited': 2678,
' coastal': 554,
' town': 22080,
' bear': 6647,
' misses': 2454,
' waiting': 14088,
' knows': 33424,
' mystery': 39976,
' departs': 322,
' townspeople': 534,
' gather': 2072,
' wish': 28775,
' seeress': 20,
' asks': 7300,
' share': 16983,
' endure': 2545,
' reveals': 7657,
' pain': 16571,
' profound': 7812,
' advice': 26285,
' judge': 6905,
' together': 59216,
' forevermore': 56,
' white': 29848,
' wings': 1952,
' scatter': 141,
' days': 39957,
' aye': 71,
' silent': 3269,
' memory': 11137,
' god': 77025,
' let': 38137,
' spaces': 1242,
' togetherness': 128,
' winds': 1838,
' heavens': 1280,
' dance': 3634,
' another': 113957,
' bond': 5871,
' shores': 647,
' fill': 6672,
' others': 79338,
' cup': 3637,
' drink': 4667,
' give': 83120,
' eat': 17151,
' loaf': 629,
' sing': 1608,
' joyous': 432,
' alone': 28148,
' strings': 1438,
' lute': 52,
' hearts': 5487,
' keeping': 12013,
' hand': 32928,
' stand': 16953,
' near': 15817,
' pillars': 862,
' temple': 3022,
' apart': 12487,
' oak': 481,
' tree': 5364,
' cypress': 67,
' shadow': 5529,
' similar': 22730,
' thetao': 4,
' te': 453,
' ching': 621,
' perennial': 513,
' classics': 4022,
' border': 2805,
' guard': 3452,
' recognises': 115,
' exile': 1652,
' five hundred': 5794,
' translated': 3484,
' tao': 1181,
' contains': 14736,
' principles': 11002,
' use': 82742,
' youre': 47915,
' forget': 13048,
' ego': 3799,
' opens': 6833,
' recommend': 72905,
' stephen': 7900,
' mitchell': 1825,
' hope': 43359,
' useful': 25187,
' menu': 1473,
' lebanese': 256,
' restaurant': 4018,
' sought': 3652,
' fully': 15178,
' parts': 29932,
' barely': 8783,
' touch': 11434,
' onlove': 1,
' insecurities': 1259,
' doubts': 3318,
' going': 96882,
' rough': 4373,
' unanswered': 1984,
' questions': 33958,
' certain': 30623,
' lucid': 2129,
' insightful': 8009,
' foodi': 2,
' thankfully': 3102,
' given': 54338,
' journeyed': 168,
' overseas': 1274,
' spiritual': 21026,
' quest': 8212,
'got': 3843,
' month': 7841,
' lost': 40945,
' job': 57801,
' relationship': 45996,
' unsteady': 71,
' felt': 59471,
' isolated': 3057,
' skipping': 2552,
' sections': 14003,
' immediately': 16292,
' pertained': 79,
' cried': 2788,
' safe': 9035,
' sadness': 2896,
' enlightenment': 3943,
' experiences': 19777,
' thoughts': 20840,
' detailed': 19873,
' accounts': 9831,
' applies': 2914,
' everyone': 45871,
' writings': 8782,
' layers': 3174,
' therefore': 13663,
' breaking': 5331,
' shell': 3811,
' encloses': 21,
'atheist': 46,
' seem': 42743,
' holds': 7370,
' references': 15229,
' walks': 4105,
' relate': 8936,
' prose': 19488,
' greatly': 7090,
' celebrated': 1474,
' countries': 11443,
' simple': 38385,
' biting': 1369,
' phrases': 4413,
' sentences': 8528,
' legacy': 4322,
' thoughtaltering': 1,
' quote': 8122,
' uses': 23670,
' short': 55582,
...})
In [110]:
words = []
freqs = []
In [111]:
for rank, word in enumerate(fd):
words.append(word)
freqs.append(fd[word])
In [112]:
frequencies = {'word': words, 'frequency':freqs}
frequencies_df = pd.DataFrame(frequencies)
In [113]:
frequencies_df.head()
Out[113]:
frequency
word
0
26
timeless
1
17851
classic
2
2481
demanding
3
2899
assuming
4
31208
title
In [114]:
frequencies_df = frequencies_df.sort_values(['frequency'], ascending=[False])
frequencies_df = frequencies_df.reset_index()
frequencies_df = frequencies_df.drop(columns=['index'])
In [115]:
frequencies_df[0:20]
Out[115]:
frequency
word
0
1502803
book
1
639620
one
2
467228
read
3
386404
like
4
365799
story
5
355169
would
6
287112
no
7
286763
time
8
273330
many
9
267441
much
10
267284
also
11
262122
books
12
260350
good
13
259519
life
14
247498
well
15
246448
people
16
245126
first
17
244133
even
18
235220
quot
19
222104
two
In [116]:
import plotly
import plotly.plotly as py
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
print(cf.__version__)
# Configure cufflings
cf.set_config_file(offline=False, world_readable=True, theme='pearl')
0.12.1
In [117]:
plotly.tools.set_credentials_file(username='falrashidi', api_key='XaO64TRYU0N3Sdup8Z3H')
In [118]:
frequencies_df['frequency'][0:75].iplot(kind='bar', xTitle='Words', yTitle='Frequency', title='Occurences in the Corpus per Word (Zipf\'s Law)')
Out[118]:
In [119]:
frequencies_df.to_csv("../data/interim/003_dictionary.csv", sep='\t', header=True, index=False);
In [128]:
df = frequencies_df.reindex_axis(sorted(frequencies_df.columns, reverse=True), axis=1)
/Users/falehalrashidi/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:1: FutureWarning:
'.reindex_axis' is deprecated and will be removed in a future version. Use '.reindex' instead.
In [131]:
sorted(frequencies_df.columns, reverse=True)
Out[131]:
['word', 'frequency']
In [133]:
final_df = frequencies_df.reindex(['word', 'frequency'], axis=1)
In [136]:
# Save a dictionary into a pickle file.
final_df.to_pickle("../data/interim/003_dictionary.p")
In [ ]:
# END_OF_FILE
Content source: VictorQuintana91/Thesis
Similar notebooks: