In [1]:
from platform import python_version
python_version()


Out[1]:
'3.6.7'

In [2]:
import pandas as pd
import multiprocessing
import nltk
import numpy as np
import sklearn
import re

In [3]:
pd.__version__, nltk.__version__, np.__version__, sklearn.__version__


Out[3]:
('0.25.1', '3.4.5', '1.16.3', '0.21.1')

In [4]:
from multiprocessing import Pool

In [5]:
from nltk.corpus import brown

generate a dataset that is 4 times as big as the Brown corpus by generating random permutations of each sentence's word order


In [6]:
def make_texts():
    # each element of brown.sents() is one sentence as a list of word tokens;
    # shuffle the tokens and join them back into a single string
    return [" ".join(np.random.permutation(sent)) for sent in brown.sents()]

In [7]:
# four independently shuffled copies of every sentence in the corpus
brown_df = pd.DataFrame({
    'text': make_texts() + make_texts() + make_texts() + make_texts()
})

In [12]:
brown_df.sample(3)


Out[12]:
text
51747 a train riding '' I been for . ways now
178451 be while fallout who should selfish because had build would I . Sir a he to in for no his sit read man of his shelters a felt it not neighbors -- secure shelter him home
132362 room my ? To ?

In [13]:
brown_df.shape


Out[13]:
(229360, 1)

In [15]:
def to_lowercase(input_string):
    return input_string.lower()

In [16]:
def replace_digits_with_token(input_string):
    # replace each standalone number with the special token "tok_num"
    return re.sub(r"\b\d+\b", "tok_num", input_string)

In [17]:
def get_text_length(input_string):
    # count tokens, splitting on whitespace, commas, and hyphens
    return len(re.split(r"(?:\s+)|(?:,)|(?:\-)", input_string))
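
a quick sanity check of the three helpers on a made-up example string:


In [ ]:
s = "The Council met on 14 August 1961"
to_lowercase(s)               # -> 'the council met on 14 august 1961'
replace_digits_with_token(s)  # -> 'The Council met on tok_num August tok_num'
get_text_length(s)            # -> 7 (this string splits on whitespace only)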

In [24]:
def process_df(df):
    
    output_df = df.copy()
    
    # replace weird double quotes with normal ones
    output_df['text'] = output_df['text'].apply(lambda text: text.replace("``", '"'))

    # text to lower case
    output_df['text'] = output_df['text'].apply(to_lowercase)
    
    # replace numbers with a special token
    output_df['text'] = output_df['text'].apply(replace_digits_with_token)
    
    # count words so texts that are too large or too small can be dropped
    output_df['num_words'] = output_df['text'].apply(get_text_length)
    
    # keep only texts with 10 to 50 words
    too_large = output_df['num_words'] > 50
    too_small = output_df['num_words'] < 10
    output_df = output_df[~too_large & ~too_small].reset_index(drop=True)
    
    return output_df
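
as an aside, the same cleanup can be written with pandas' vectorized .str methods instead of row-wise .apply; a minimal sketch with the same logic (not the version timed below):


In [ ]:
def process_df_vectorized(df):
    output_df = df.copy()
    output_df['text'] = (output_df['text']
                         .str.replace("``", '"', regex=False)
                         .str.lower()
                         .str.replace(r"\b\d+\b", "tok_num", regex=True))
    # multi-character patterns are treated as regular expressions by str.split
    output_df['num_words'] = output_df['text'].str.split(r"(?:\s+)|(?:,)|(?:\-)").str.len()
    return output_df[output_df['num_words'].between(10, 50)].reset_index(drop=True)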

In [25]:
%%time
processed_df = process_df(brown_df)


CPU times: user 2.55 s, sys: 16 ms, total: 2.56 s
Wall time: 2.56 s

In [26]:
processed_df.head()


Out[26]:
text num_words
0 " investigation election recent evidence atlanta's jury friday irregularities of that any place said county '' produced no . primary the an took fulton grand 25
1 of had of . atlanta the , in , city city conducted '' over-all the " deserves term-end the committee which the and presentments election which executive election the for the praise that further thanks the in was charge of jury said manner 47
2 been hard-fought irregularities reports '' pye possible in of by by . allen court jury durwood jr. judge term had which won superior to mayor-nominate investigate charged the ivan primary " fulton september-october was the 38
3 '' election of a the city in , . was the such voters jury the '' considering reports , relative , only and handful widespread of size number of this the the said interest received " " 40
4 it and are ambiguous did many . outmoded the registration georgia's find jury that said or '' laws election of and inadequate " often 24

In [27]:
processed_df.shape


Out[27]:
(174440, 2)
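
the length filter keeps 174,440 of the original 229,360 texts, roughly 76%.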

parallel version: split the dataframe into one chunk per core and run process_df on each chunk in its own process


In [28]:
NUM_CORES = 8

# split the dataframe into NUM_CORES roughly equal-sized chunks
# (np.array_split, unlike np.split, tolerates an uneven division)
df_chunks = np.array_split(brown_df, NUM_CORES)

In [29]:
%%time

# each worker process runs process_df on one chunk; pool.map preserves chunk order
with multiprocessing.Pool(NUM_CORES) as pool:
    processed_df = pd.concat(pool.map(process_df, df_chunks), ignore_index=True)


CPU times: user 172 ms, sys: 124 ms, total: 296 ms
Wall time: 1.02 s
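
wall time drops from 2.56 s (serial) to 1.02 s, a roughly 2.5x speedup on 8 cores; spawning the worker processes and pickling the chunks to and from them accounts for much of the gap to an ideal 8x.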

In [30]:
processed_df.shape


Out[30]:
(174440, 2)
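
the split / map / concat pattern is easy to factor into a reusable helper; a minimal sketch (the name parallelize_dataframe is our own):


In [ ]:
def parallelize_dataframe(df, func, num_cores=NUM_CORES):
    chunks = np.array_split(df, num_cores)
    with multiprocessing.Pool(num_cores) as pool:
        # each worker applies func to one chunk; results come back in order
        return pd.concat(pool.map(func, chunks), ignore_index=True)

# usage: processed_df = parallelize_dataframe(brown_df, process_df)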