Filtering nouns



In [1]:

    
import pandas as pd



In [2]:

    
# For monitoring duration of pandas processes
from tqdm import tqdm, tqdm_pandas

# To avoid RuntimeError: Set changed size during iteration
tqdm.monitor_interval = 0

# Register `pandas.progress_apply` and `pandas.Series.map_apply` with `tqdm`
# (can use `tqdm_gui`, `tqdm_notebook`, optional kwargs, etc.)
tqdm.pandas(desc="Progress:")

# Now you can use `progress_apply` instead of `apply`
# and `progress_map` instead of `map`
# can also groupby:
# df.groupby(0).progress_apply(lambda x: x**2)



In [3]:

    
# df0 = pd.read_pickle('../data/interim/004_synonyms_grouped_1k.p')
df0 = pd.read_pickle('../data/interim/002_keyed_nouns.p')



In [4]:

    
df0.head()









    Out[4]:







  
    
      
      uniqueKey
      reviewText
    
  
  
    
      0
      A2XQ5LZHTD4AFT##000100039X
      [timeless,  gibran,  backs,  content,  means, ...
    
    
      1
      AF7CSSGV93RXN##000100039X
      [ prophet,  kahlil,  gibran,  thirty,  years, ...
    
    
      2
      A1NPNGWBVD9AK3##000100039X
      [ first,  books,  recall,  collection,  gibran...
    
    
      3
      A3IS4WGMFR4X65##000100039X
      [prophet,  kahlil,  work,  world,  million,  c...
    
    
      4
      AWLFVCT9128JV##000100039X
      [gibran,  khalil,  gibran,  born,  one thousan...



In [5]:

    
dictionary_df00 = pd.read_pickle('../data/interim/003_dictionary.p')



In [6]:

    
len(dictionary_df00)









    Out[6]:





822604



In [7]:

    
dictionary_df00.head()

The idea

Words that only appear once cannot be frequent words even in their own context; so they will be filtered out. Then lets calculate the average frequency for the remaining words--remember; this dictionary does not only concern nouns.

Notice: grouping of noun synonyms done in `004_grouping_domain_synonyms` is repeated here once filtering out nouns is applied, since it will take far less time to be applied on the whole dataset once the latter is filter (`004_grouping_domain_synonyms` was aplied only on 1k reviews)



In [8]:

    
dictionary_df00.loc[dictionary_df00['frequency'] > 5].describe()









    Out[8]:







  
    
      
      frequency
    
  
  
    
      count
      1.550540e+05
    
    
      mean
      5.394970e+02
    
    
      std
      6.586737e+03
    
    
      min
      6.000000e+00
    
    
      25%
      1.000000e+01
    
    
      50%
      2.200000e+01
    
    
      75%
      9.100000e+01
    
    
      max
      1.502803e+06



In [9]:

    
dictionary_df00['word'].loc[dictionary_df00['frequency'] > 4].count()









    Out[9]:





172284



In [10]:

    
gt4_dictionary_df01 = dictionary_df00.loc[dictionary_df00['frequency'] > 4]



In [11]:

    
dictionary_df00['frequency'].loc[dictionary_df00['frequency'] > 4].describe()









    Out[11]:





count    1.722840e+05
mean     4.860424e+02
std      6.250750e+03
min      5.000000e+00
25%      8.000000e+00
50%      1.800000e+01
75%      7.400000e+01
max      1.502803e+06
Name: frequency, dtype: float64



In [12]:

    
# Use threshold for first quantile
final_dic = gt4_dictionary_df01.loc[dictionary_df00['frequency'] < 8]
len(final_dic)









    Out[12]:





39890



In [13]:

    
final_dic_df01 = final_dic.assign(normalised = final_dic['frequency'].progress_apply(lambda frequency:frequency/486))
final_dic_df01.head()









    



Progress:: 100%|██████████| 39890/39890 [00:00<00:00, 1326705.15it/s]






    Out[13]:







  
    
      
      word
      frequency
      normalised
    
  
  
    
      132394
      wordlessness
      7
      0.014403
    
    
      132395
      ciasponsored
      7
      0.014403
    
    
      132396
      sophieannes
      7
      0.014403
    
    
      132397
      traster
      7
      0.014403
    
    
      132398
      tedlock
      7
      0.014403

Begin noun filtering



In [14]:

    
df0.head()









    Out[14]:







  
    
      
      uniqueKey
      reviewText
    
  
  
    
      0
      A2XQ5LZHTD4AFT##000100039X
      [timeless,  gibran,  backs,  content,  means, ...
    
    
      1
      AF7CSSGV93RXN##000100039X
      [ prophet,  kahlil,  gibran,  thirty,  years, ...
    
    
      2
      A1NPNGWBVD9AK3##000100039X
      [ first,  books,  recall,  collection,  gibran...
    
    
      3
      A3IS4WGMFR4X65##000100039X
      [prophet,  kahlil,  work,  world,  million,  c...
    
    
      4
      AWLFVCT9128JV##000100039X
      [gibran,  khalil,  gibran,  born,  one thousan...



In [15]:

    
df1 = pd.DataFrame(df0.uniqueKey.str.split('##',1).tolist(),columns = ['userId','asin'])
df1.head()









    Out[15]:







  
    
      
      userId
      asin
    
  
  
    
      0
      A2XQ5LZHTD4AFT
      000100039X
    
    
      1
      AF7CSSGV93RXN
      000100039X
    
    
      2
      A1NPNGWBVD9AK3
      000100039X
    
    
      3
      A3IS4WGMFR4X65
      000100039X
    
    
      4
      AWLFVCT9128JV
      000100039X



In [16]:

    
df_reviewText = pd.DataFrame(df0['reviewText'])
df_reviewText.head()









    Out[16]:







  
    
      
      reviewText
    
  
  
    
      0
      [timeless,  gibran,  backs,  content,  means, ...
    
    
      1
      [ prophet,  kahlil,  gibran,  thirty,  years, ...
    
    
      2
      [ first,  books,  recall,  collection,  gibran...
    
    
      3
      [prophet,  kahlil,  work,  world,  million,  c...
    
    
      4
      [gibran,  khalil,  gibran,  born,  one thousan...



In [17]:

    
df_new = pd.concat([df1, df_reviewText], axis=1)
df_new.head()









    Out[17]:







  
    
      
      userId
      asin
      reviewText
    
  
  
    
      0
      A2XQ5LZHTD4AFT
      000100039X
      [timeless,  gibran,  backs,  content,  means, ...
    
    
      1
      AF7CSSGV93RXN
      000100039X
      [ prophet,  kahlil,  gibran,  thirty,  years, ...
    
    
      2
      A1NPNGWBVD9AK3
      000100039X
      [ first,  books,  recall,  collection,  gibran...
    
    
      3
      A3IS4WGMFR4X65
      000100039X
      [prophet,  kahlil,  work,  world,  million,  c...
    
    
      4
      AWLFVCT9128JV
      000100039X
      [gibran,  khalil,  gibran,  born,  one thousan...



In [18]:

    
df_new_01 = df_new.assign(wordCountBefore = df_new['reviewText'].progress_apply(lambda review:len(review)))
df_new_01.head()









    



Progress:: 100%|██████████| 582711/582711 [00:00<00:00, 1217178.22it/s]






    Out[18]:







  
    
      
      userId
      asin
      reviewText
      wordCountBefore
    
  
  
    
      0
      A2XQ5LZHTD4AFT
      000100039X
      [timeless,  gibran,  backs,  content,  means, ...
      49
    
    
      1
      AF7CSSGV93RXN
      000100039X
      [ prophet,  kahlil,  gibran,  thirty,  years, ...
      19
    
    
      2
      A1NPNGWBVD9AK3
      000100039X
      [ first,  books,  recall,  collection,  gibran...
      76
    
    
      3
      A3IS4WGMFR4X65
      000100039X
      [prophet,  kahlil,  work,  world,  million,  c...
      142
    
    
      4
      AWLFVCT9128JV
      000100039X
      [gibran,  khalil,  gibran,  born,  one thousan...
      48



In [19]:

    
final_dic_df01['word'] = final_dic_df01['word'].progress_apply(lambda word: word.replace(" ",""))
final_dic_df01 = final_dic_df01.reset_index()
final_dic_df01.head()









    



Progress:: 100%|██████████| 39890/39890 [00:00<00:00, 1211063.08it/s]






    Out[19]:







  
    
      
      index
      word
      frequency
      normalised
    
  
  
    
      0
      132394
      wordlessness
      7
      0.014403
    
    
      1
      132395
      ciasponsored
      7
      0.014403
    
    
      2
      132396
      sophieannes
      7
      0.014403
    
    
      3
      132397
      traster
      7
      0.014403
    
    
      4
      132398
      tedlock
      7
      0.014403



In [20]:

    
filtered_dict = final_dic_df01['word'].to_dict()
inv_filtered_dict = {v: k for k, v in filtered_dict.items()}
inv_filtered_dict









    Out[20]:





{'wordlessness': 0,
 'ciasponsored': 1,
 'sophieannes': 2,
 'traster': 3,
 'tedlock': 4,
 'pestiferous': 5,
 'himselfas': 6,
 'shigeko': 7,
 'poe': 8,
 'aureus': 9,
 'easiertoread': 10,
 'joyrides': 11,
 'simmis': 12,
 '2014genres': 13,
 'pigafetta': 14,
 'wyss': 15,
 'psychodelic': 16,
 'schoool': 17,
 'hjelms': 18,
 'boadt': 19,
 'savona': 20,
 'bettany': 21,
 'teached': 22,
 'pageandahalf': 23,
 'pinch': 24,
 'policyby': 25,
 'usagainstthem': 26,
 'oompaloompas': 27,
 'vitually': 28,
 'buhle': 29,
 'lims': 30,
 'welltitled': 31,
 'costcos': 32,
 'rabbithole': 33,
 'whalens': 34,
 'infomration': 35,
 'rizzolli': 36,
 'laughingdog': 37,
 'gloomies': 38,
 'mugwort': 39,
 'lovescenes': 40,
 'throughit': 41,
 'agress': 42,
 'wellsubstantiated': 43,
 'esbat': 44,
 'sothat': 45,
 'celierian': 46,
 'harlequinjunkie': 47,
 'wisconsinmadison': 48,
 'mandatory': 49,
 'rezzians': 50,
 'sheks': 51,
 'spearman': 52,
 'latisha': 53,
 'pssst': 54,
 'meiss': 55,
 'flutterings': 56,
 'sympton': 57,
 'ethniccleansing': 58,
 'gaslights': 59,
 'posner': 60,
 'makea': 61,
 'grovelled': 62,
 'daetrin': 63,
 'rehabilitative': 64,
 'canyonlands': 65,
 'varity': 66,
 'orthogonian': 67,
 'cordry': 68,
 'cail': 69,
 'parallell': 70,
 'varigated': 71,
 'ididnt': 72,
 'kosmitoras': 73,
 'judaism': 74,
 'rokesmith': 75,
 'redicks': 76,
 'prosecco': 77,
 'nbs': 78,
 'knickerbocker': 79,
 'rugar': 80,
 'shiptoship': 81,
 'bernadino': 82,
 'angermanagement': 83,
 'suskins': 84,
 'nonparametric': 85,
 'noirlike': 86,
 'cardwell': 87,
 'blindnesses': 88,
 'smallworld': 89,
 'phal': 90,
 'wml': 91,
 'ebbtide': 92,
 'factthe': 93,
 'thassas': 94,
 'dekker': 95,
 'selfenhancement': 96,
 'redbeards': 97,
 'sladens': 98,
 '5yr': 99,
 'frazen': 100,
 'hillock': 101,
 'coldbloodedness': 102,
 'mundt': 103,
 'shgall': 104,
 '60lbs': 105,
 'stelian': 106,
 'nextand': 107,
 'koester': 108,
 'kayso': 109,
 'frishman': 110,
 'greengage': 111,
 'gurkhas': 112,
 'loking': 113,
 'expertness': 114,
 'lukan': 115,
 'olena': 116,
 'impishness': 117,
 'burmark': 118,
 'kohima': 119,
 'shilled': 120,
 'rives': 121,
 'goodside': 122,
 'deatils': 123,
 'causal': 124,
 'blowjob': 125,
 'mindkiller': 126,
 'brigerton': 127,
 'waacbo': 128,
 'doorstopsized': 129,
 'efts': 130,
 'alsoand': 131,
 'caughleigh': 132,
 'wondla': 133,
 'esvs': 134,
 'eisley': 135,
 'hardright': 136,
 'responsable': 137,
 'macraes': 138,
 'hkan': 139,
 'sketchpad': 140,
 'holocron': 141,
 'gabrial': 142,
 'foreigness': 143,
 'leake': 144,
 'crowbars': 145,
 'ontheroad': 146,
 'soweto': 147,
 'messanic': 148,
 'antiarmor': 149,
 'hipocracy': 150,
 'tradings': 151,
 'korriban': 152,
 'govenor': 153,
 'welp': 154,
 'laterborns': 155,
 'belowi': 156,
 'thirty-ninethousand': 157,
 'addi': 158,
 'discused': 159,
 'bibiography': 160,
 'unterwegers': 161,
 'brendan': 162,
 'soda': 163,
 'menuges': 164,
 'somewaht': 165,
 'vina': 166,
 'occassionaly': 167,
 'moqtada': 168,
 'andby': 169,
 '2011author': 170,
 'silurian': 171,
 'nightblooming': 172,
 'schurchs': 173,
 'isopropanol': 174,
 'ohmaes': 175,
 '5writing': 176,
 'vezin': 177,
 'gazers': 178,
 'allthemore': 179,
 'shechem': 180,
 'unrolled': 181,
 'alysa': 182,
 'kronnenberg': 183,
 'conure': 184,
 'taillight': 185,
 'sealts': 186,
 'abased': 187,
 'bonewitz': 188,
 'klawans': 189,
 'religiouslike': 190,
 'razorbacks': 191,
 'ldlc': 192,
 'adzuki': 193,
 'nonstatisticians': 194,
 'selfstimulation': 195,
 'greenstreet': 196,
 'hiebert': 197,
 'razia': 198,
 '5quart': 199,
 '150m': 200,
 'lydda': 201,
 'psr': 202,
 'troutman': 203,
 'se7': 204,
 'chanus': 205,
 'softpedaling': 206,
 'woodcourt': 207,
 'belateche': 208,
 'nivs': 209,
 'storiesof': 210,
 'westcentral': 211,
 'corella': 212,
 'benedikt': 213,
 'icequeen': 214,
 'ballons': 215,
 '6lbs': 216,
 'columned': 217,
 'palawan': 218,
 'chathrand': 219,
 'beginningtoend': 220,
 'frangible': 221,
 'amazin': 222,
 'kuti': 223,
 'pseudonymn': 224,
 'tamest': 225,
 'naga': 226,
 'principlebased': 227,
 'egalatarian': 228,
 'itselfas': 229,
 'simcity': 230,
 'dalet': 231,
 'sawn': 232,
 'jewishgentile': 233,
 'tcol': 234,
 'pullouts': 235,
 'knobby': 236,
 'americanenglish': 237,
 'stuffand': 238,
 'twixt': 239,
 'dianne': 240,
 'beppe': 241,
 'refusenik': 242,
 'recommneded': 243,
 'faud': 244,
 'cockfight': 245,
 'circulator': 246,
 'sisinlaw': 247,
 'madsons': 248,
 'cookbookthe': 249,
 'jaywalkers': 250,
 'schaum': 251,
 '11the': 252,
 'sport': 253,
 'bibliograpy': 254,
 'nearreligious': 255,
 'impossibles': 256,
 'voyeuristically': 257,
 'suspsense': 258,
 'repairmen': 259,
 'nonnuclear': 260,
 'phonographic': 261,
 'fallaciously': 262,
 'lebows': 263,
 'behavour': 264,
 'frentis': 265,
 'technologyand': 266,
 'marxengels': 267,
 'tryto': 268,
 'warriorprophet': 269,
 'restauranteurs': 270,
 'colan': 271,
 'ellena': 272,
 'authencity': 273,
 'okbomb': 274,
 'aldi': 275,
 'christinaity': 276,
 'thatthese': 277,
 'clemson': 278,
 'chukrow': 279,
 'perspicaciously': 280,
 'fpws': 281,
 'endeavouring': 282,
 'argonos': 283,
 'schutzs': 284,
 'posssible': 285,
 'p30': 286,
 'weare': 287,
 'countercultures': 288,
 'emaleth': 289,
 'multihued': 290,
 'joshing': 291,
 'druss': 292,
 'hada': 293,
 'largest': 294,
 'chroot': 295,
 'suited': 296,
 'crutchfield': 297,
 'ballantynes': 298,
 'brastemp': 299,
 'huri': 300,
 'parfords': 301,
 'rasenberger': 302,
 'kore': 303,
 'binjamin': 304,
 'oaklands': 305,
 'mitchellcoauthor': 306,
 'starsseries': 307,
 'buggered': 308,
 'bflat': 309,
 'quoteunquote': 310,
 'nilus': 311,
 'charwoman': 312,
 'ede': 313,
 'weismann': 314,
 'tailgate': 315,
 'life4': 316,
 'wildcraft': 317,
 'handstands': 318,
 'speckles': 319,
 'dude': 320,
 'zelana': 321,
 'twoplus': 322,
 'sage': 323,
 'cowled': 324,
 'influentials': 325,
 'modulus': 326,
 'friendships': 327,
 'englishisbn': 328,
 'utilized': 329,
 'sabans': 330,
 'intersting': 331,
 'pappa': 332,
 'meaningless': 333,
 'coochie': 334,
 'hyperlinking': 335,
 'sthe': 336,
 'hightraffic': 337,
 'youof': 338,
 'egoself': 339,
 'superglue': 340,
 'carra': 341,
 'lefthemisphere': 342,
 'korkis': 343,
 'bears': 344,
 'asplundh': 345,
 'hanagarne': 346,
 'aj': 347,
 'nonaristocratic': 348,
 'herselfshe': 349,
 'humilated': 350,
 'adriel': 351,
 'byatts': 352,
 'lexicographical': 353,
 'vulcano': 354,
 'allnot': 355,
 'beginningthe': 356,
 'bloodmages': 357,
 'prinicples': 358,
 'hayness': 359,
 'rubegoldberg': 360,
 'stormchasers': 361,
 'lousia': 362,
 'madrien': 363,
 'overregulated': 364,
 'variola': 365,
 'masklin': 366,
 'brownmillers': 367,
 'wayof': 368,
 'fimiliar': 369,
 'refractor': 370,
 'selectionism': 371,
 'reluctance': 372,
 'vanishings': 373,
 'raelians': 374,
 'peachiness': 375,
 'reisss': 376,
 'ilona': 377,
 'vers': 378,
 'abo': 379,
 'superpeck': 380,
 'biberkopf': 381,
 'shintoism': 382,
 'phyical': 383,
 'tormod': 384,
 'juvenalia': 385,
 'portays': 386,
 'caymens': 387,
 'neapolitans': 388,
 'lhop': 389,
 'whiteskinned': 390,
 'williston': 391,
 'semifictionalized': 392,
 'overreact': 393,
 'dalit': 394,
 'tob': 395,
 'remarkability': 396,
 'grubbs': 397,
 'portholes': 398,
 'greated': 399,
 'thomasalexandre': 400,
 'mutnojmet': 401,
 'massmurderers': 402,
 'brogues': 403,
 'aghia': 404,
 'herer': 405,
 'astronomy': 406,
 'pennywell': 407,
 'amunition': 408,
 'technosavvy': 409,
 'killik': 410,
 'forreal': 411,
 'habitants': 412,
 'vees': 413,
 'eyeballed': 414,
 'toryboy': 415,
 'sofrep': 416,
 'strikingly': 417,
 'happyhappy': 418,
 'banditos': 419,
 'polisher': 420,
 'mustafar': 421,
 '5pound': 422,
 'tracies': 423,
 'lobotomist': 424,
 'joltingly': 425,
 'buh': 426,
 'orchestrations': 427,
 'twoweapon': 428,
 'muscovites': 429,
 'teppics': 430,
 'chronologically': 431,
 'deduct': 432,
 'undergo': 433,
 'silverwood': 434,
 'deatha': 435,
 'antinausea': 436,
 'peninsulas': 437,
 'personell': 438,
 'cunaxa': 439,
 'nerf': 440,
 'rothfusss': 441,
 'leyers': 442,
 'higginss': 443,
 'holo': 444,
 'hayley': 445,
 'grazida': 446,
 'lempriere': 447,
 '5gtotal': 448,
 'haugen': 449,
 'meed': 450,
 'deleos': 451,
 'wroughtiron': 452,
 'agreeing': 453,
 'duelists': 454,
 'muscovite': 455,
 'albertfor': 456,
 'goateed': 457,
 'agendae': 458,
 'resourcing': 459,
 'negrons': 460,
 'scaleable': 461,
 'packham': 462,
 'grattan': 463,
 '5kg': 464,
 'clayers': 465,
 'pesty': 466,
 'sixshooters': 467,
 'abridgments': 468,
 'sekigahara': 469,
 'bloodynine': 470,
 'thatwould': 471,
 'glotka': 472,
 'emboss': 473,
 'doto': 474,
 'erno': 475,
 'quillers': 476,
 'herbivoracious': 477,
 'vonni': 478,
 'landholding': 479,
 'offloom': 480,
 'marielle': 481,
 'alumnae': 482,
 'missionary': 483,
 'bezetov': 484,
 'chinamen': 485,
 'gus': 486,
 'andwith': 487,
 'hansulrich': 488,
 'somnolence': 489,
 'chastizing': 490,
 'hdx': 491,
 'alainn': 492,
 'mateguas': 493,
 'microtension': 494,
 'heyman': 495,
 'odoherty': 496,
 'brobdingnagian': 497,
 'rellenos': 498,
 'whitemans': 499,
 'contast': 500,
 'hetchins': 501,
 'geagley': 502,
 'amazoners': 503,
 'servant': 504,
 'industriously': 505,
 'sto': 506,
 'gadflies': 507,
 'brynley': 508,
 'delica': 509,
 'pinol': 510,
 'alexandermoegerle': 511,
 'togetherit': 512,
 'smattered': 513,
 'britishism': 514,
 'pimm': 515,
 'tranquilized': 516,
 'motorways': 517,
 'andau': 518,
 'geserit': 519,
 'soult': 520,
 'chiefofstaff': 521,
 'expanation': 522,
 'boop': 523,
 'productplacement': 524,
 'tschichold': 525,
 'pretoria': 526,
 'inservice': 527,
 'fiddlehead': 528,
 'advancement': 529,
 'pffft': 530,
 'abridges': 531,
 'dorning': 532,
 'sugarville': 533,
 'jemiah': 534,
 'suppurating': 535,
 'manipulatives': 536,
 'phonicsbased': 537,
 'responsiblefor': 538,
 'worksit': 539,
 'semiacademic': 540,
 'merope': 541,
 'expence': 542,
 'oharrow': 543,
 'kinselection': 544,
 'procedings': 545,
 'practive': 546,
 'brainman': 547,
 'vascillates': 548,
 'tacker': 549,
 'suborbital': 550,
 'waaaayyy': 551,
 'reguardless': 552,
 'kharijites': 553,
 'succcess': 554,
 'eggregious': 555,
 'p212': 556,
 'spellbindingly': 557,
 'pachita': 558,
 'slainte': 559,
 'queenthe': 560,
 'samhita': 561,
 'noufs': 562,
 'icewater': 563,
 'descripton': 564,
 'posthippie': 565,
 'jhs': 566,
 'inscape': 567,
 'marchese': 568,
 'p139': 569,
 'cunnane': 570,
 'stigmatised': 571,
 'usmle': 572,
 'pajaritas': 573,
 'biogas': 574,
 'greedo': 575,
 'stamos': 576,
 'shirkers': 577,
 'decontamination': 578,
 'fugui': 579,
 'pograms': 580,
 'uncapable': 581,
 'halfwritten': 582,
 'werecoyote': 583,
 'esotericists': 584,
 'wouldlike': 585,
 'thecase': 586,
 'unflavored': 587,
 'feiges': 588,
 'diffee': 589,
 'prinzel': 590,
 'enoyable': 591,
 'paser': 592,
 'buruus': 593,
 'sah': 594,
 'fastaction': 595,
 'americn': 596,
 'webmage': 597,
 'sluething': 598,
 'alternadad': 599,
 'glamourized': 600,
 'growthmindset': 601,
 'manxs': 602,
 'telecommute': 603,
 '10mg': 604,
 'lowerpaid': 605,
 'labratory': 606,
 'wetzel': 607,
 'oversentimentality': 608,
 'alyce': 609,
 'seethat': 610,
 'americansand': 611,
 'p27': 612,
 'narrows': 613,
 'rashleigh': 614,
 'hieber': 615,
 'hangnails': 616,
 'bogies': 617,
 'unappealingly': 618,
 'wristwatches': 619,
 'levitated': 620,
 'multiplexes': 621,
 'edells': 622,
 'outdated': 623,
 'istria': 624,
 'storyteller': 625,
 'tariq': 626,
 'metroplex': 627,
 'dumba': 628,
 'voldemorts': 629,
 'litteraly': 630,
 'giulias': 631,
 'condensate': 632,
 'ih': 633,
 'wyndanos': 634,
 'pagone': 635,
 'lull': 636,
 'applebee': 637,
 'elyse': 638,
 'fangtasia': 639,
 'milleu': 640,
 'protected': 641,
 'kamon': 642,
 'osama': 643,
 'roid': 644,
 'dyans': 645,
 'bombmaker': 646,
 'macintryes': 647,
 'exlaw': 648,
 'vulpine': 649,
 'sata': 650,
 'korelitzs': 651,
 'toput': 652,
 'flouridation': 653,
 '41s': 654,
 'cants': 655,
 'thiry': 656,
 'impacted': 657,
 'linus': 658,
 'riverting': 659,
 'junkscience': 660,
 'mcelvaines': 661,
 'copyandpaste': 662,
 'hitchen': 663,
 'unrra': 664,
 'gbs': 665,
 'auvers': 666,
 'kak': 667,
 'ql': 668,
 'redmeat': 669,
 'billiejo': 670,
 'lacerates': 671,
 'commitmentphobia': 672,
 'organisers': 673,
 'firststrike': 674,
 'achivements': 675,
 'lammle': 676,
 'romanceaholic': 677,
 'steeps': 678,
 'labyrinth': 679,
 'ischia': 680,
 'myrers': 681,
 'principe': 682,
 'einmal': 683,
 'fruitfly': 684,
 '91st': 685,
 'clifford': 686,
 'amberstones': 687,
 'postimpressionist': 688,
 'nearsuicidal': 689,
 'tigres': 690,
 'saxby': 691,
 'whiteonwhite': 692,
 'mankilling': 693,
 'kammer': 694,
 'rashelle': 695,
 'rebbetzin': 696,
 'considerd': 697,
 'faithwhich': 698,
 'tnh': 699,
 'everyoneeven': 700,
 'joonas': 701,
 'souads': 702,
 'rockerfeller': 703,
 'prepolitical': 704,
 'heggan': 705,
 'koreanstyle': 706,
 'fastapproaching': 707,
 'mccain': 708,
 'tsien': 709,
 'fireroasted': 710,
 'nasv': 711,
 'cabinetlevel': 712,
 'jerkier': 713,
 'debtbased': 714,
 'beancounter': 715,
 'rundowns': 716,
 'paolas': 717,
 'lorrimer': 718,
 'revoloution': 719,
 'unclosed': 720,
 'countrybycountry': 721,
 'prechopped': 722,
 'fitzjames': 723,
 'misallocated': 724,
 'endstate': 725,
 'northen': 726,
 'mindbeauty': 727,
 'levar': 728,
 'loveyou': 729,
 'interventional': 730,
 'gah': 731,
 'initialized': 732,
 'squints': 733,
 'youget': 734,
 'lgs': 735,
 'utina': 736,
 'corruptibility': 737,
 'thigns': 738,
 'biologys': 739,
 'mafiastyle': 740,
 'preview': 741,
 'bait': 742,
 'rabbis': 743,
 'okinawas': 744,
 'starbursts': 745,
 'nissinen': 746,
 'motility': 747,
 'condemning': 748,
 'shoaff': 749,
 'ie5': 750,
 'quasivictorian': 751,
 'secunda': 752,
 'largetype': 753,
 'cons': 754,
 'realscape': 755,
 'annoted': 756,
 'horserace': 757,
 'obsessive': 758,
 'mayaguez': 759,
 'gilletteauthor': 760,
 'linquistic': 761,
 'sexaddict': 762,
 'spanishstyle': 763,
 'blixs': 764,
 'litwin': 765,
 'beeper': 766,
 'moderation': 767,
 'understandibly': 768,
 'dragout': 769,
 'bugialli': 770,
 '000ft': 771,
 'susah': 772,
 'psywar': 773,
 'tepesh': 774,
 'desley': 775,
 'thenand': 776,
 'graetz': 777,
 'redacting': 778,
 'osler': 779,
 'inabilty': 780,
 'p23': 781,
 'buttonpushing': 782,
 'finanacial': 783,
 'kira': 784,
 'medstar': 785,
 'abouti': 786,
 'preferrable': 787,
 'ummayad': 788,
 'tormance': 789,
 'ashbrook': 790,
 'bardell': 791,
 'treelike': 792,
 'lengthiness': 793,
 'cybermen': 794,
 'nambula': 795,
 'molins': 796,
 'indichova': 797,
 '86yearold': 798,
 'waksal': 799,
 'p185': 800,
 'roerich': 801,
 'martensons': 802,
 'anthracite': 803,
 'gallerstein': 804,
 'mayham': 805,
 'introducer': 806,
 'businessworld': 807,
 'poirot': 808,
 'throe': 809,
 'outofthebody': 810,
 'midwife': 811,
 'mj': 812,
 'tayeb': 813,
 'skyisfalling': 814,
 'boxcutter': 815,
 'mouselike': 816,
 'antihistory': 817,
 'tourvel': 818,
 'ramban': 819,
 'emplacement': 820,
 'stewarts': 821,
 'esto': 822,
 'marco': 823,
 'mengers': 824,
 'dips': 825,
 'tonios': 826,
 'minervini': 827,
 'alatristes': 828,
 'spyplane': 829,
 'anglophobia': 830,
 'murel': 831,
 'bodo': 832,
 'readmore': 833,
 'ual': 834,
 'statecentric': 835,
 'uncrossable': 836,
 'greatpower': 837,
 'totting': 838,
 'unproveable': 839,
 'chainofcommand': 840,
 'allgrain': 841,
 'committing': 842,
 'appologize': 843,
 'captious': 844,
 'meddlers': 845,
 'blueribbon': 846,
 'problemas': 847,
 'proletariats': 848,
 'orner': 849,
 'blabla': 850,
 'negitive': 851,
 'heirophant': 852,
 'stross': 853,
 'trammeled': 854,
 'gringrich': 855,
 'ninetyminute': 856,
 'councillors': 857,
 'foresighted': 858,
 'mister': 859,
 'biogaphy': 860,
 'yds': 861,
 'stength': 862,
 'scourby': 863,
 'barrymores': 864,
 'scourbys': 865,
 'hosps': 866,
 'kossmann': 867,
 'romanovich': 868,
 'hermother': 869,
 'identiy': 870,
 'attendence': 871,
 'foreknow': 872,
 'tarpaper': 873,
 'rawlinss': 874,
 'rahabs': 875,
 'everfaithful': 876,
 'nonspecialized': 877,
 'anthropomorphising': 878,
 'leastunderstood': 879,
 'environmental': 880,
 'yn': 881,
 'drennan': 882,
 'sigfrid': 883,
 'distinctly': 884,
 'magritte': 885,
 'clealy': 886,
 'deeping': 887,
 'lowfunctioning': 888,
 'subby': 889,
 'larken': 890,
 'testators': 891,
 'shite': 892,
 'intentially': 893,
 'abna': 894,
 'birthpangs': 895,
 'diseconomies': 896,
 'overtechnical': 897,
 'mannering': 898,
 'construal': 899,
 'nottoosubtle': 900,
 'buccaneering': 901,
 'maidservants': 902,
 'pramal': 903,
 'budhism': 904,
 'prophets': 905,
 'genada': 906,
 'taked': 907,
 'valise': 908,
 'sportswear': 909,
 'bascom': 910,
 'cucullus': 911,
 'hokitika': 912,
 'tattooes': 913,
 'immobilization': 914,
 'tooltips': 915,
 'denis': 916,
 'posteritys': 917,
 'faversham': 918,
 'guma': 919,
 'demise': 920,
 'kirrick': 921,
 'lowther': 922,
 'dars': 923,
 'mindlessly': 924,
 'lincel': 925,
 'gincy': 926,
 'finneran': 927,
 'dishonoured': 928,
 'luddite': 929,
 'sixgun': 930,
 'kelman': 931,
 'bootstrapped': 932,
 'rustics': 933,
 'beleaguer': 934,
 'lollapalooza': 935,
 'helos': 936,
 'mayor': 937,
 'turbotax': 938,
 'robet': 939,
 'kalush': 940,
 'geta': 941,
 'strayers': 942,
 'similarlooking': 943,
 'governmentprovided': 944,
 'stettin': 945,
 'vandenberg': 946,
 'nightshirt': 947,
 'grounded': 948,
 'shuya': 949,
 'robt': 950,
 'pachakuti': 951,
 'liliane': 952,
 'indetail': 953,
 'moocs': 954,
 'netgallery': 955,
 'infastructure': 956,
 'heinleins': 957,
 'reminiscenses': 958,
 'specifially': 959,
 'materialistically': 960,
 'xylaras': 961,
 'shinigami': 962,
 'myrick': 963,
 'barrytown': 964,
 'astrogator': 965,
 'tenniels': 966,
 'machiavellians': 967,
 'neurotoxin': 968,
 'unpremeditated': 969,
 'contrasty': 970,
 'avedon': 971,
 'yvonne': 972,
 'maldivian': 973,
 'kaname': 974,
 'weeny': 975,
 'coeditors': 976,
 'drows': 977,
 'mountains': 978,
 'saltfree': 979,
 'woodalls': 980,
 'familyas': 981,
 'mendenhall': 982,
 'sixthgraders': 983,
 'dtui': 984,
 'protestation': 985,
 'speedreaders': 986,
 'pertwee': 987,
 'leones': 988,
 'trainwrecks': 989,
 'revamps': 990,
 'onesie': 991,
 'lamottes': 992,
 'insitute': 993,
 'severns': 994,
 'nonjudgmentalism': 995,
 'percabeth': 996,
 'blank': 997,
 'briec': 998,
 'erradicate': 999,
 ...}



In [21]:

    
def filter_words(review):
    new_review = []
    for word in review:
        word = word.strip()
        if word in inv_filtered_dict:
            new_review.append(word)
    return new_review



In [22]:

    
df_new_02 = df_new_01.assign(filteredText = df_new_01['reviewText'].progress_apply(lambda review:filter_words(review)))









    



Progress:: 100%|██████████| 582711/582711 [00:10<00:00, 57569.25it/s]



In [23]:

    
df_new_03 = df_new_02.assign(wordCountAfter = df_new_02['filteredText'].progress_apply(lambda review:len(review)))
df_new_03[0:20]









    



Progress:: 100%|██████████| 582711/582711 [00:00<00:00, 1287602.10it/s]






    Out[23]:







  
    
      
      userId
      asin
      reviewText
      wordCountBefore
      filteredText
      wordCountAfter
    
  
  
    
      0
      A2XQ5LZHTD4AFT
      000100039X
      [timeless,  gibran,  backs,  content,  means, ...
      49
      [messege, sermon, prophets, flows]
      4
    
    
      1
      AF7CSSGV93RXN
      000100039X
      [ prophet,  kahlil,  gibran,  thirty,  years, ...
      19
      []
      0
    
    
      2
      A1NPNGWBVD9AK3
      000100039X
      [ first,  books,  recall,  collection,  gibran...
      76
      [catechism, texts, siddhartha, preachers, prop...
      8
    
    
      3
      A3IS4WGMFR4X65
      000100039X
      [prophet,  kahlil,  work,  world,  million,  c...
      142
      [claude, mastery, biographers]
      3
    
    
      4
      AWLFVCT9128JV
      000100039X
      [gibran,  khalil,  gibran,  born,  one thousan...
      48
      [almustafa]
      1
    
    
      5
      AFY0BT42DDYZV
      000100039X
      [days,  gibrans,  gets,  literature,  yet,  bo...
      177
      [profits, twentysix, sage, metaphors]
      4
    
    
      6
      A25P6DY6ARTCGZ
      000100039X
      [book,  gibran,  took,  millions,  encapsulate...
      29
      []
      0
    
    
      7
      A1SP45I55GQIIE
      000100039X
      [ words,  kahlil,  gibran,  divine,  wisdom,  ...
      35
      [meanings]
      1
    
    
      8
      A2E71VWXO59342
      000100039X
      [prophet,  dispenses,  wisdom,  ones,  bids,  ...
      29
      []
      0
    
    
      9
      A2OP1HD9RGX5OW
      000100039X
      [book,  myth,  work,  beauty,  whose,  every, ...
      42
      [simplicity, relies]
      2
    
    
      10
      A2052JNVUPRTMT
      000100039X
      [ gets,  bedrock,  man,  prophet,  anyone,  wo...
      43
      []
      0
    
    
      11
      AGKPTMTR3UX1R
      000100039X
      [kahlil,  eighteen million,  poet,  mystic,  n...
      50
      []
      0
    
    
      12
      A1HS49P9TZRGV9
      000100039X
      [ book,  collection,  remember,  around,  twel...
      67
      [default]
      1
    
    
      13
      A2ZZHMT58ZMVCZ
      000100039X
      [prophet,  years,  ship,  back,  homeland,  kn...
      111
      [departs, pillars, te, exile]
      4
    
    
      14
      A3W43PSHRIG8KV
      000100039X
      [ aware,  kahlil,  gibran,  read,  poem,  menu...
      46
      [menu]
      1
    
    
      15
      A1TR1LU2JSZLUL
      000100039X
      [book,  gift,  journeyed,  overseas,  quest,  ...
      39
      [overseas]
      1
    
    
      16
      ADIDQRLLR4KBQ
      000100039X
      [atheist,  may,  seem,  strange,  people,  boo...
      90
      [phrases, metaphors, prophets]
      3
    
    
      17
      A3AW2ZG0GP4SKN
      000100039X
      [ book,  son,  stolen,  despair,  resonance,  ...
      11
      [resonance]
      1
    
    
      18
      A2MMON52VMO7NT
      000100039X
      [gibrans,  words,  lay,  bare,  simplicity,  t...
      23
      [simplicity, enrich, hail]
      3
    
    
      19
      AR72Z89LACZ8Q
      000100039X
      [ departure,  prophet,  people,  gather,  arou...
      26
      []
      0



In [24]:

    
remaining = 1 - df_new_03['wordCountAfter'].sum() / df_new_03['wordCountBefore'].sum()



In [25]:

    
print("Average noun reduction achieved:" + str(remaining*100) + "%")









    



Average noun reduction achieved:95.95373520483005%

Association Rules Mining Filtering



In [26]:

    
df_books_bigReviews = pd.DataFrame(df_new_03[['asin','filteredText']].groupby(['asin'])['filteredText'].progress_apply(list))
df_books_bigReviews = df_books_bigReviews.reset_index()
df_books_bigReviews = df_books_bigReviews.assign(transactions = df_books_bigReviews['filteredText'].progress_apply(lambda reviews_lis:len(reviews_lis)))
df_books_bigReviews.head()









    



Progress:: 100%|█████████▉| 59324/59325 [00:02<00:00, 27837.71it/s]
Progress:: 100%|██████████| 59324/59324 [00:00<00:00, 1308829.53it/s]






    Out[26]:







  
    
      
      asin
      filteredText
      transactions
    
  
  
    
      0
      000100039X
      [[messege, sermon, prophets, flows], [], [cate...
      30
    
    
      1
      0002051850
      [[periods, progresses, usage, thee, virtues, a...
      31
    
    
      2
      0002113570
      [[], [continues, usfor, continues], [behavior]...
      7
    
    
      3
      0002117088
      [[surgery, goodnight, claude, claude, sorts, t...
      5
    
    
      4
      000215725X
      [[], [], [fraser, fraser, perpetual, fraser, f...
      11



In [27]:

    
from apyori import apriori

# Support
# Support is an indication of how frequently the itemset appears in the dataset.
# Confidence
# Confidence is an indication of how often the rule has been found to be true.
# Lift
# The ratio of the observed support to that expected if X and Y were independent.
def apply_arm(transactions):
    return list(apriori(transactions, min_support = 1/len(transactions), min_confidence = 1, min_lift = len(transactions), max_length = 4))



In [28]:

    
books_with_arm = df_books_bigReviews.assign(arm = df_books_bigReviews['filteredText'].progress_apply(lambda list_of_reviews:apply_arm(list_of_reviews)))
books_with_arm.head()









    



Progress:: 100%|██████████| 59324/59324 [5:25:02<00:00,  3.04it/s]     






    Out[28]:







  
    
      
      asin
      filteredText
      transactions
      arm
    
  
  
    
      0
      000100039X
      [[messege, sermon, prophets, flows], [], [cate...
      30
      [((speaker, arabic), 0.03333333333333333, [Ord...
    
    
      1
      0002051850
      [[periods, progresses, usage, thee, virtues, a...
      31
      [((19yearolds, muck), 0.03225806451612903, [Or...
    
    
      2
      0002113570
      [[], [continues, usfor, continues], [behavior]...
      7
      [((homo, ancestors), 0.14285714285714285, [Ord...
    
    
      3
      0002117088
      [[surgery, goodnight, claude, claude, sorts, t...
      5
      [((goodnight, claude), 0.2, [OrderedStatistic(...
    
    
      4
      000215725X
      [[], [], [fraser, fraser, perpetual, fraser, f...
      11
      [((17th, colony), 0.09090909090909091, [Ordere...



In [29]:

    
def get_important_nouns(arms):
    imp_nns = []
    if "items" in pd.DataFrame(arms).keys():
        results = list(pd.DataFrame(arms)['items'])
        for result in results:
            if len(list(result)) > 4:
                imp_nns = imp_nns + list(list(result))
        if(len(imp_nns)==0):
            for result in results:
                if len(list(result)) > 3:
                    imp_nns = imp_nns + list(list(result))            
        return list(set(imp_nns))
    return list(set(imp_nns))



In [30]:

    
imp_nns_df = books_with_arm.assign(imp_nns = books_with_arm['arm']
                                   .progress_apply(lambda arms:get_important_nouns(arms)))
imp_nns_df.head()









    



Progress:: 100%|██████████| 59324/59324 [13:34:44<00:00,  1.21it/s]      






    Out[30]:







  
    
      
      asin
      filteredText
      transactions
      arm
      imp_nns
    
  
  
    
      0
      000100039X
      [[messege, sermon, prophets, flows], [], [cate...
      30
      [((speaker, arabic), 0.03333333333333333, [Ord...
      [kneads, profits, preachers, territory, exile,...
    
    
      1
      0002051850
      [[periods, progresses, usage, thee, virtues, a...
      31
      [((19yearolds, muck), 0.03225806451612903, [Or...
      [declarations, towns, smaller, threatens, desi...
    
    
      2
      0002113570
      [[], [continues, usfor, continues], [behavior]...
      7
      [((homo, ancestors), 0.14285714285714285, [Ord...
      [humane, homo, ancestors, michener]
    
    
      3
      0002117088
      [[surgery, goodnight, claude, claude, sorts, t...
      5
      [((goodnight, claude), 0.2, [OrderedStatistic(...
      [surgery, sorts, goodnight, virtues, translato...
    
    
      4
      000215725X
      [[], [], [fraser, fraser, perpetual, fraser, f...
      11
      [((17th, colony), 0.09090909090909091, [Ordere...
      [treachery, fort, emperors, 17th, uk, mundane,...



In [31]:

    
imp_nns_df = imp_nns_df[['asin','imp_nns']]
imp_nns_df.head()









    Out[31]:







  
    
      
      asin
      imp_nns
    
  
  
    
      0
      000100039X
      [kneads, profits, preachers, territory, exile,...
    
    
      1
      0002051850
      [declarations, towns, smaller, threatens, desi...
    
    
      2
      0002113570
      [humane, homo, ancestors, michener]
    
    
      3
      0002117088
      [surgery, sorts, goodnight, virtues, translato...
    
    
      4
      000215725X
      [treachery, fort, emperors, 17th, uk, mundane,...



In [32]:

    
imp_nns_df.to_pickle("../data/interim/005_important_nouns.p")



In [33]:

    
imp_nns_df = imp_nns_df.assign(num_of_imp_nouns = imp_nns_df['imp_nns'].progress_apply(lambda imp_nouns:len(imp_nouns)))
imp_nns_df.head()









    



Progress:: 100%|██████████| 59324/59324 [00:00<00:00, 1183158.14it/s]






    Out[33]:







  
    
      
      asin
      imp_nns
      num_of_imp_nouns
    
  
  
    
      0
      000100039X
      [kneads, profits, preachers, territory, exile,...
      26
    
    
      1
      0002051850
      [declarations, towns, smaller, threatens, desi...
      73
    
    
      2
      0002113570
      [humane, homo, ancestors, michener]
      4
    
    
      3
      0002117088
      [surgery, sorts, goodnight, virtues, translato...
      7
    
    
      4
      000215725X
      [treachery, fort, emperors, 17th, uk, mundane,...
      39

Some more stats



In [34]:

    
import plotly 
import plotly.plotly as py
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
print(cf.__version__)
# Configure cufflings 
cf.set_config_file(offline=False, world_readable=True, theme='pearl')



In [36]:

    
# Filter out synonyms again



In [38]:

    
booksWithNoImportantNouns = imp_nns_df.loc[imp_nns_df['num_of_imp_nouns'] == 0]
len(booksWithNoImportantNouns)









    Out[38]:





10385



In [39]:

    
booksWithNoImportantNouns = imp_nns_df.loc[imp_nns_df['num_of_imp_nouns'] != 0]
len(booksWithNoImportantNouns)









    Out[39]:





48939



In [41]:

    
booksWithNoImportantNouns[0:20]









    Out[41]:







  
    
      
      asin
      imp_nns
      num_of_imp_nouns
    
  
  
    
      0
      000100039X
      [kneads, profits, preachers, territory, exile,...
      26
    
    
      1
      0002051850
      [declarations, towns, smaller, threatens, desi...
      73
    
    
      2
      0002113570
      [humane, homo, ancestors, michener]
      4
    
    
      3
      0002117088
      [surgery, sorts, goodnight, virtues, translato...
      7
    
    
      4
      000215725X
      [treachery, fort, emperors, 17th, uk, mundane,...
      39
    
    
      5
      0002219417
      [humanlevel, smaller, conversion, periods, lic...
      32
    
    
      6
      000222383X
      [treasons, construct, expansion, captains, fav...
      11
    
    
      7
      0002226618
      [coward, towering, territory, papers, macdonal...
      23
    
    
      8
      000224053X
      [fundamentalists, coast, pioneer, inconsistenc...
      81
    
    
      9
      0002242052
      [stretches, authorities, ludlum, drugdealers, ...
      14
    
    
      10
      0002311216
      [andersons, espionage, poirot, worldwide, open...
      16
    
    
      11
      0002550938
      [fights, mysery, pollution, cusslers, threads,...
      6
    
    
      12
      000255383X
      [ubi, shark, merge, beeper, phrases]
      5
    
    
      13
      0002621509
      [surgery, espionage, dolgun, paranoid, employe...
      8
    
    
      14
      0002726874
      [macdonalds, aboot, ye, smaller, theer, uk, al...
      63
    
    
      15
      0002727463
      [violets, armies, elizabethan, 17th, remarks, ...
      43
    
    
      17
      0004723724
      [forth, profundis, masterpieces, shorter, soci...
      6
    
    
      18
      000612609X
      [homers, mastery, remarks, brighteyes, mirskya...
      38
    
    
      19
      0006135129
      [merchant, satisfies, port, ratings, atonement...
      15
    
    
      20
      0006136389
      [neanderthals, rat, authorities, remarks, phil...
      40



In [42]:

    
booksWithNoImportantNouns['num_of_imp_nouns'].iplot(kind='histogram', bins=100, xTitle='Number of Important Nouns', yTitle='Number of Books')









    



/Users/falehalrashidi/anaconda3/lib/python3.6/site-packages/plotly/plotly/plotly.py:224: UserWarning:

Woah there! Look at all those points! Due to browser limitations, the Plotly SVG drawing functions have a hard time graphing more than 500k data points for line charts, or 40k points for other types of charts. Here are some suggestions:
(1) Use the `plotly.graph_objs.Scattergl` trace object to generate a WebGl graph.
(2) Trying using the image API to return an image instead of a graph URL
(3) Use matplotlib
(4) See if you can create your visualization with fewer data points

If the visualization you're using aggregates points (e.g., box plot, histogram, etc.) you can disregard this warning.







    Out[42]:



In [43]:

    
booksWithNoImportantNouns.describe()









    Out[43]:







  
    
      
      num_of_imp_nouns
    
  
  
    
      count
      48939.000000
    
    
      mean
      19.532663
    
    
      std
      17.880293
    
    
      min
      4.000000
    
    
      25%
      8.000000
    
    
      50%
      14.000000
    
    
      75%
      25.000000
    
    
      max
      226.000000



In [ ]:

	uniqueKey	reviewText
0	A2XQ5LZHTD4AFT##000100039X	[timeless, gibran, backs, content, means, ...
1	AF7CSSGV93RXN##000100039X	[ prophet, kahlil, gibran, thirty, years, ...
2	A1NPNGWBVD9AK3##000100039X	[ first, books, recall, collection, gibran...
3	A3IS4WGMFR4X65##000100039X	[prophet, kahlil, work, world, million, c...
4	AWLFVCT9128JV##000100039X	[gibran, khalil, gibran, born, one thousan...

	frequency
count	1.550540e+05
mean	5.394970e+02
std	6.586737e+03
min	6.000000e+00
25%	1.000000e+01
50%	2.200000e+01
75%	9.100000e+01
max	1.502803e+06

	word	frequency	normalised
132394	wordlessness	7	0.014403
132395	ciasponsored	7	0.014403
132396	sophieannes	7	0.014403
132397	traster	7	0.014403
132398	tedlock	7	0.014403

	userId	asin
0	A2XQ5LZHTD4AFT	000100039X
1	AF7CSSGV93RXN	000100039X
2	A1NPNGWBVD9AK3	000100039X
3	A3IS4WGMFR4X65	000100039X
4	AWLFVCT9128JV	000100039X

	asin	filteredText	transactions
0	000100039X	[[messege, sermon, prophets, flows], [], [cate...	30
1	0002051850	[[periods, progresses, usage, thee, virtues, a...	31
2	0002113570	[[], [continues, usfor, continues], [behavior]...	7
3	0002117088	[[surgery, goodnight, claude, claude, sorts, t...	5
4	000215725X	[[], [], [fraser, fraser, perpetual, fraser, f...	11

	asin	filteredText	transactions	arm
0	000100039X	[[messege, sermon, prophets, flows], [], [cate...	30	[((speaker, arabic), 0.03333333333333333, [Ord...
1	0002051850	[[periods, progresses, usage, thee, virtues, a...	31	[((19yearolds, muck), 0.03225806451612903, [Or...
2	0002113570	[[], [continues, usfor, continues], [behavior]...	7	[((homo, ancestors), 0.14285714285714285, [Ord...
3	0002117088	[[surgery, goodnight, claude, claude, sorts, t...	5	[((goodnight, claude), 0.2, [OrderedStatistic(...
4	000215725X	[[], [], [fraser, fraser, perpetual, fraser, f...	11	[((17th, colony), 0.09090909090909091, [Ordere...

	asin	imp_nns
0	000100039X	[kneads, profits, preachers, territory, exile,...
1	0002051850	[declarations, towns, smaller, threatens, desi...
2	0002113570	[humane, homo, ancestors, michener]
3	0002117088	[surgery, sorts, goodnight, virtues, translato...
4	000215725X	[treachery, fort, emperors, 17th, uk, mundane,...

	num_of_imp_nouns
count	48939.000000
mean	19.532663
std	17.880293
min	4.000000
25%	8.000000
50%	14.000000
75%	25.000000
max	226.000000