In [1]:
from platform import python_version
python_version()


Out[1]:
'3.6.7'

In [2]:
import pandas as pd
import multiprocessing
import nltk
import numpy as np
import sklearn
import re

In [3]:
pd.__version__, nltk.__version__, np.__version__, sklearn.__version__


Out[3]:
('0.25.1', '3.4.5', '1.16.3', '0.21.1')

In [4]:
from multiprocessing import Pool

In [5]:
from nltk.corpus import brown

generate a dataset that is 4 times as big as the Brown corpus by generating random permutations of each sentence's word order


In [6]:
def make_texts():
    # each element of brown.sents() is one sentence as a list of word tokens;
    # shuffle the tokens and join them back into a single string
    return [" ".join(np.random.permutation(sent)) for sent in brown.sents()]

In [7]:
# four independently shuffled copies of every sentence in the corpus
brown_df = pd.DataFrame({
    'text': make_texts() + make_texts() + make_texts() + make_texts()
})

In [12]:
brown_df.sample(3)


Out[12]:
text
51747 a train riding '' I been for . ways now
178451 be while fallout who should selfish because had build would I . Sir a he to in for no his sit read man of his shelters a felt it not neighbors -- secure shelter him home
132362 room my ? To ?

In [13]:
brown_df.shape


Out[13]:
(229360, 1)

In [15]:
def to_lowercase(input_string):
    return input_string.lower()

In [16]:
def replace_digits_with_token(input_string):
    # replace each standalone number with the special token "tok_num"
    return re.sub(r"\b\d+\b", "tok_num", input_string)

In [17]:
def get_text_length(input_string):
    # count tokens, splitting on whitespace, commas, and hyphens
    return len(re.split(r"(?:\s+)|(?:,)|(?:\-)", input_string))
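
a quick sanity check of the three helpers on a made-up example string:


In [ ]:
s = "The Council met on 14 August 1961"
to_lowercase(s)               # -> 'the council met on 14 august 1961'
replace_digits_with_token(s)  # -> 'The Council met on tok_num August tok_num'
get_text_length(s)            # -> 7 (this string splits on whitespace only)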

In [24]:
def process_df(df):
    
    output_df = df.copy()
    
    # replace weird double quotes with normal ones
    output_df['text'] = output_df['text'].apply(lambda text: text.replace("``", '"'))

    # text to lower case
    output_df['text'] = output_df['text'].apply(to_lowercase)
    
    # replace numbers with a special token
    output_df['text'] = output_df['text'].apply(replace_digits_with_token)
    
    # count words so texts that are too large or too small can be dropped
    output_df['num_words'] = output_df['text'].apply(get_text_length)
    
    # keep only texts with 10 to 50 words
    too_large = output_df['num_words'] > 50
    too_small = output_df['num_words'] < 10
    output_df = output_df[~too_large & ~too_small].reset_index(drop=True)
    
    return output_df
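
as an aside, the same cleanup can be written with pandas' vectorized .str methods instead of row-wise .apply; a minimal sketch with the same logic (not the version timed below):


In [ ]:
def process_df_vectorized(df):
    output_df = df.copy()
    output_df['text'] = (output_df['text']
                         .str.replace("``", '"', regex=False)
                         .str.lower()
                         .str.replace(r"\b\d+\b", "tok_num", regex=True))
    # multi-character patterns are treated as regular expressions by str.split
    output_df['num_words'] = output_df['text'].str.split(r"(?:\s+)|(?:,)|(?:\-)").str.len()
    return output_df[output_df['num_words'].between(10, 50)].reset_index(drop=True)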

In [25]:
%%time
processed_df = process_df(brown_df)


CPU times: user 2.55 s, sys: 16 ms, total: 2.56 s
Wall time: 2.56 s

In [26]:
processed_df.head()


Out[26]:
text num_words
0 " investigation election recent evidence atlanta's jury friday irregularities of that any place said county '' produced no . primary the an took fulton grand 25
1 of had of . atlanta the , in , city city conducted '' over-all the " deserves term-end the committee which the and presentments election which executive election the for the praise that further thanks the in was charge of jury said manner 47
2 been hard-fought irregularities reports '' pye possible in of by by . allen court jury durwood jr. judge term had which won superior to mayor-nominate investigate charged the ivan primary " fulton september-october was the 38
3 '' election of a the city in , . was the such voters jury the '' considering reports , relative , only and handful widespread of size number of this the the said interest received " " 40
4 it and are ambiguous did many . outmoded the registration georgia's find jury that said or '' laws election of and inadequate " often 24

In [27]:
processed_df.shape


Out[27]:
(174440, 2)
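
the length filter keeps 174,440 of the original 229,360 texts, roughly 76%.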

parallel version: split the dataframe into one chunk per core and run process_df on each chunk in its own process


In [28]:
NUM_CORES = 8

# split the dataframe into NUM_CORES roughly equal-sized chunks
# (np.array_split, unlike np.split, tolerates an uneven division)
df_chunks = np.array_split(brown_df, NUM_CORES)

In [29]:
%%time

# each worker process runs process_df on one chunk; pool.map preserves chunk order
with multiprocessing.Pool(NUM_CORES) as pool:
    processed_df = pd.concat(pool.map(process_df, df_chunks), ignore_index=True)


CPU times: user 172 ms, sys: 124 ms, total: 296 ms
Wall time: 1.02 s
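
wall time drops from 2.56 s (serial) to 1.02 s, a roughly 2.5x speedup on 8 cores; spawning the worker processes and pickling the chunks to and from them accounts for much of the gap to an ideal 8x.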

In [30]:
processed_df.shape


Out[30]:
(174440, 2)
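
the split / map / concat pattern is easy to factor into a reusable helper; a minimal sketch (the name parallelize_dataframe is our own):


In [ ]:
def parallelize_dataframe(df, func, num_cores=NUM_CORES):
    chunks = np.array_split(df, num_cores)
    with multiprocessing.Pool(num_cores) as pool:
        # each worker applies func to one chunk; results come back in order
        return pd.concat(pool.map(func, chunks), ignore_index=True)

# usage: processed_df = parallelize_dataframe(brown_df, process_df)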