FILE READER

The following code will read several books in .pdf format to later compute the frequency of the words by language.


In [1]:
import os
import pandas as pd
import time
from tqdm import tqdm_notebook as tqdm

# specify the folder's directory where the book files are located
book_dir = './Books'

# create two empty Dataframes to later store the info computed from every book file
count_result = pd.DataFrame(columns=['lang', 'author', 'book_title', 'words', 'freq'])
stat_result = pd.DataFrame(columns=['lang', 'author', 'book_title', 'process_time',
                                    'uniq_words','total_words'])

time_start = time.time()
time_accum = 0

# iterate and read every file by language, author, and title
for language in tqdm(os.listdir(book_dir)):
    for author in os.listdir(book_dir + '/' + language):
        for title in os.listdir(book_dir + '/' + language + '/' + author):
            
            # this is the resulting path...
            title_path = book_dir + '/' + language + '/' + author + '/' + title
            
            # now it will read on every file
            with open(title_path, 'r', encoding='utf8') as current_file:
                text = current_file.read()
                
                # the following lines clean the book's content for the further analysis
                text = text.replace('\n', ' ').replace('\r', ' ') 
                text = text.lower()    # turn every letter into lower case
                
                # remove the most common symbols, marks, and numbers
                skip_list = [',', '.', ':', ';', '¿', '?', '¡', '!', '#' '"', "'", '-', '(', ')', '{', '}',
                            '1', '2', '3', '4', '5', '6', '7', '8', '9', '0']
                
                for ch in skip_list:
                    text = text.replace(ch, '')
                
                # create a temporary dataframe for every book title to store and isolate the stats collected
                temp_df = pd.DataFrame(columns=['lang', 'author', 'book_title', 'words', 'freq'])
                
                # this loop will count the frequency for every unique word
                for word in list(filter(None, text.split(' '))):
                    if word in temp_df['words'].values:
                        temp_df.loc[temp_df.words == word, 'freq'] += 1
                    else:
                        temp_df.loc[len(temp_df)] = language, author, title.replace('.txt', ''), word, 1
                
                temp_df = temp_df.sort_values('freq', ascending=False) # sort the dataframe in descending order
                
                # collect the data from the current file before moving to the next one
                process_time = round(time.time() - time_accum - time_start, 2)
                time_accum += process_time
                stat_result.loc[len(stat_result)] = language, author, title.replace('.txt', ''), process_time, len(temp_df), sum(temp_df['freq'].values)
            
            # this will save and accumulate the info collected from the temporary dataframe into a different table
            count_result = pd.concat([count_result, temp_df], axis=0, ignore_index=True)

# output the results as a .csv file
stat_result.to_csv('stat_summary.csv', sep=',')
count_result.to_csv('word_frequencies.csv', sep=',')

print('\n-----------------------------')
print(stat_result)
print('\n-----------------------------')
print(count_result)



-----------------------------
           lang         author                              book_title  \
0       english    shakespeare               A Midsummer Night's Dream   
1       english    shakespeare                                  Hamlet   
2       english    shakespeare                                 Macbeth   
3       english    shakespeare                                 Othello   
4       english    shakespeare                             Richard III   
5       english    shakespeare                        Romeo and Juliet   
6       english    shakespeare                  The Merchant of Venice   
7        french      chevalier                          L'ale de sable   
8        french      chevalier  L'enfer et le paradis de l'autre monde   
9        french      chevalier                            La capitaine   
10       french      chevalier             La fille des indiens rouges   
11       french      chevalier                      La fille du pirate   
12       french      chevalier                        Le chasseur noir   
13       french      chevalier                   Les derniers Iroquois   
14       french  de Maupassant                           Boule de Suif   
15       french  de Maupassant                          Claire de Lune   
16       french  de Maupassant                    Contes de la Becasse   
17       french  de Maupassant   Euvres completes de Guy de Maupassant   
18       french  de Maupassant                         L'inutile beaut   
19       french  de Maupassant                          La Main Gauche   
20       french  de Maupassant                       La Maison Tellier   
21       french  de Maupassant                         La petite roque   
22       french  de Maupassant                                Le Horla   
23       french        diderot                 Ceci n'est pas un conte   
24       french        diderot    Entretien d'un pare avec ses enfants   
25       french        diderot                          L'oiseau blanc   
26       french        diderot              Les deux amis de Bourbonne   
27       french        diderot  Regrets sur ma vieille robe de chambre   
28       french           sand                                    cora   
29       french           sand      Jacques le fataliste et son maatre   
..          ...            ...                                     ...   
72       german    shakespeare                         Romeo und Julia   
73   portuguese         branco                    A Filha do Arcediago   
74   portuguese         branco                     A Neta do Arcediago   
75   portuguese         branco                       A Queda d'um Anjo   
76   portuguese         branco                      Agulha em Palheiro   
77   portuguese         branco                        Amor de Perdicao   
78   portuguese         branco                        Amor de Salvacao   
79   portuguese         branco                          Annos de Prosa   
80   portuguese         branco                          Carlota Angela   
81   portuguese         branco                      Estrellas Funestas   
82   portuguese         branco                     Estrellas Propicias   
83   portuguese         branco                     Lagrimas Abenaoadas   
84   portuguese         branco                     Livro de Consolacao   
85   portuguese         branco                         O Olho de Vidro   
86   portuguese         branco                    O que fazem mulheres   
87   portuguese         branco                              O Regicida   
88   portuguese         branco                   Scenas Contemporaneas   
89   portuguese          dinis             A Morgadinha dos Cannaviaes   
90   portuguese          dinis            Os fidalgos da Casa Mourisca   
91   portuguese          dinis                     Uma familia ingleza   
92   portuguese        Queiros                    A Cidade e as Serras   
93   portuguese        Queiros    A correspondancia de Fradique Mendes   
94   portuguese        Queiros              A Illustre Casa de Ramires   
95   portuguese        Queiros                              A Reliquia   
96   portuguese        Queiros                    Cartas de Inglaterra   
97   portuguese        Queiros                  O crime do padre Amaro   
98   portuguese        Queiros                              O Mandarim   
99   portuguese        Queiros                         O Primo Bazilio   
100  portuguese        Queiros                                Os Maias   
101  portuguese    shakespeare                                  Hamlet   

     process_time uniq_words total_words  
0           81.30       3226       16972  
1          145.04       4794       29575  
2           86.92       3552       17646  
3          131.33       4032       27379  
4          164.85       4705       34665  
5          136.36       4325       28920  
6           99.99       3529       21951  
7          498.27      13420       77237  
8          238.54       7549       44085  
9          281.60       9421       49153  
10         468.99      12677       72098  
11         325.74      11148       52969  
12         317.44       8860       55197  
13         368.96      11370       59296  
14         216.56       8322       39201  
15         142.55       6434       28054  
16         200.75       7948       37309  
17         353.40      11747       57768  
18         222.65       7967       41145  
19         197.98       7431       37089  
20         274.03       9308       47994  
21         241.70       8027       44309  
22         244.25       8129       44560  
23          50.01       3111       11379  
24          51.34       3027       11679  
25         111.02       4992       23468  
26          37.41       2545        8804  
27          21.43       1636        5264  
28          59.10       3742       13224  
29         587.25      12076       95090  
..            ...        ...         ...  
72         111.66       5365       23293  
73         488.96      12740       78149  
74         342.99      10690       58000  
75         333.35      12274       52895  
76         275.29      10048       46938  
77         296.79       9580       51352  
78         322.33      11600       52100  
79         436.01      12960       67263  
80         299.18      10465       50429  
81         305.61      10431       51228  
82         271.82       9644       47073  
83         284.45       9699       49025  
84         369.52      12429       58501  
85         283.07      10423       48083  
86         254.62       9819       43882  
87         335.32      11662       55239  
88         320.58      11097       53202  
89        1200.59      19715      148738  
90        1126.08      17492      144454  
91         969.08      17612      121923  
92         508.40      14453       71227  
93         388.24      13465       56881  
94         834.74      17577      107379  
95         616.67      15747       84947  
96         300.57      11205       48481  
97        1129.89      18832      141629  
98         116.44       6831       22486  
99         900.40      17980      118417  
100       1989.46      24453      215271  
101        183.22       7206       34327  

[102 rows x 6 columns]

-----------------------------
              lang       author                 book_title         words freq
0          english  shakespeare  A Midsummer Night's Dream           the  579
1          english  shakespeare  A Midsummer Night's Dream           and  562
2          english  shakespeare  A Midsummer Night's Dream             i  443
3          english  shakespeare  A Midsummer Night's Dream            to  337
4          english  shakespeare  A Midsummer Night's Dream           you  273
5          english  shakespeare  A Midsummer Night's Dream            of  269
6          english  shakespeare  A Midsummer Night's Dream             a  264
7          english  shakespeare  A Midsummer Night's Dream            in  239
8          english  shakespeare  A Midsummer Night's Dream            my  204
9          english  shakespeare  A Midsummer Night's Dream            is  190
10         english  shakespeare  A Midsummer Night's Dream          that  184
11         english  shakespeare  A Midsummer Night's Dream          with  175
12         english  shakespeare  A Midsummer Night's Dream            me  174
13         english  shakespeare  A Midsummer Night's Dream           not  171
14         english  shakespeare  A Midsummer Night's Dream          this  162
15         english  shakespeare  A Midsummer Night's Dream           her  148
16         english  shakespeare  A Midsummer Night's Dream           for  143
17         english  shakespeare  A Midsummer Night's Dream            it  132
18         english  shakespeare  A Midsummer Night's Dream          your  128
19         english  shakespeare  A Midsummer Night's Dream           but  121
20         english  shakespeare  A Midsummer Night's Dream          thou  118
21         english  shakespeare  A Midsummer Night's Dream            as  115
22         english  shakespeare  A Midsummer Night's Dream            so  113
23         english  shakespeare  A Midsummer Night's Dream          will  111
24         english  shakespeare  A Midsummer Night's Dream          loue  105
25         english  shakespeare  A Midsummer Night's Dream            be  104
26         english  shakespeare  A Midsummer Night's Dream          haue   95
27         english  shakespeare  A Midsummer Night's Dream           his   93
28         english  shakespeare  A Midsummer Night's Dream           all   91
29         english  shakespeare  A Midsummer Night's Dream            no   85
...            ...          ...                        ...           ...  ...
825219  portuguese  shakespeare                     Hamlet     lançarmas    1
825220  portuguese  shakespeare                     Hamlet    arrancarme    1
825221  portuguese  shakespeare                     Hamlet   representem    1
825222  portuguese  shakespeare                     Hamlet  esbofetearme    1
825223  portuguese  shakespeare                     Hamlet     attentado    1
825224  portuguese  shakespeare                     Hamlet       inacção    1
825225  portuguese  shakespeare                     Hamlet          fico    1
825226  portuguese  shakespeare                     Hamlet       confusa    1
825227  portuguese  shakespeare                     Hamlet         tibia    1
825228  portuguese  shakespeare                     Hamlet       ficavam    1
825229  portuguese  shakespeare                     Hamlet   vilipendios    1
825230  portuguese  shakespeare                     Hamlet     possiveis    1
825231  portuguese  shakespeare                     Hamlet   inoffensivo    1
825232  portuguese  shakespeare                     Hamlet           fel    1
825233  portuguese  shakespeare                     Hamlet       trahese    1
825234  portuguese  shakespeare                     Hamlet    espontanea    1
825235  portuguese  shakespeare                     Hamlet   perturbaram    1
825236  portuguese  shakespeare                     Hamlet    dramaticas    1
825237  portuguese  shakespeare                     Hamlet    assistindo    1
825238  portuguese  shakespeare                     Hamlet          eila    1
825239  portuguese  shakespeare                     Hamlet         pausa    1
825240  portuguese  shakespeare                     Hamlet    procuremos    1
825241  portuguese  shakespeare                     Hamlet   imprecações    1
825242  portuguese  shakespeare                     Hamlet           vãs    1
825243  portuguese  shakespeare                     Hamlet        gastar    1
825244  portuguese  shakespeare                     Hamlet      instigam    1
825245  portuguese  shakespeare                     Hamlet     adulterio    1
825246  portuguese  shakespeare                     Hamlet      impudico    1
825247  portuguese  shakespeare                     Hamlet       abutres    1
825248  portuguese  shakespeare                     Hamlet          hear    1

[825249 rows x 5 columns]

VISUALIZE THE RESULTS


In [2]:
%matplotlib inline

import matplotlib.pyplot as plt

plt.figure(figsize = (12,12))

# plot the stat from every book by language
for language in os.listdir(book_dir):
    subset = stat_result[stat_result.lang == language]   # filter the stat data by language
    plt.loglog(subset.total_words, subset.uniq_words, "o", label = language)

plt.legend()
plt.xlabel("Total Number of Words")
plt.ylabel("Number of unique words")
plt.savefig("total_vs_unique_words.png")
plt.show()



In [ ]: