In [1]:
from platform import python_version
python_version()
Out[1]:
In [2]:
import pandas as pd
import multiprocessing
import nltk
import numpy as np
import sklearn
import re
In [3]:
pd.__version__, nltk.__version__, np.__version__, sklearn.__version__
Out[3]:
In [4]:
from multiprocessing import Pool
In [5]:
from nltk.corpus import brown
In [6]:
def make_texts():
    # one text per Brown sentence, with its word order shuffled
    return [" ".join(np.random.permutation(sent)) for sent in brown.sents()]
In [7]:
brown_df = pd.DataFrame({
    'text': make_texts() + make_texts() + make_texts() + make_texts()
})
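Brown has roughly 57,000 sentences, so concatenating four calls to make_texts() gives a DataFrame of roughly 230,000 rows; the duplication is only there to make the corpus large enough for the timing comparison below to be meaningful. As a quick check:

len(brown.sents())  # roughly 57,000 sentences; brown_df holds four times that many rows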
In [12]:
brown_df.sample(3)
Out[12]:
In [13]:
brown_df.shape
Out[13]:
In [15]:
def to_lowercase(input_string):
    return input_string.lower()
In [16]:
def replace_digits_with_token(input_string):
    # replace standalone numbers with a placeholder token
    return re.sub(r"\b\d+\b", "tok_num", input_string)
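The \b word boundaries mean only standalone numbers are replaced; digits embedded in a word (e.g. "42nd") are left alone. For illustration:

replace_digits_with_token("Room 101 opened in 1984, on the 42nd floor")
# -> 'Room tok_num opened in tok_num, on the 42nd floor'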
In [17]:
def get_text_length(input_string):
    # rough word count: split on whitespace, commas, and hyphens
    return len(re.split(r"(?:\s+)|(?:,)|(?:-)", input_string))
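A worked example, for illustration: consecutive separators are not merged, so a comma followed by a space yields an empty string in the split. The count is therefore a rough upper bound on the true word count rather than an exact figure:

re.split(r"(?:\s+)|(?:,)|(?:-)", "a well-known, simple fact")
# -> ['a', 'well', 'known', '', 'simple', 'fact']  (length 6)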
In [24]:
def process_df(df):
    # work on a copy so the caller's DataFrame is left untouched
    output_df = df.copy()
    # replace Penn-style double quotes (``) with normal ones
    output_df['text'] = output_df['text'].apply(lambda text: text.replace("``", '"'))
    # lower-case the text
    output_df['text'] = output_df['text'].apply(to_lowercase)
    # replace numbers with a special token
    output_df['text'] = output_df['text'].apply(replace_digits_with_token)
    # drop texts that are too long or too short
    output_df['num_words'] = output_df['text'].apply(get_text_length)
    rows_too_large = output_df[output_df['num_words'] > 50]
    output_df.drop(rows_too_large.index, inplace=True)
    rows_too_small = output_df[output_df['num_words'] < 10]
    output_df.drop(rows_too_small.index, inplace=True)
    output_df.reset_index(drop=True, inplace=True)
    return output_df
In [25]:
%%time
processed_df = process_df(brown_df)
In [26]:
processed_df.head()
Out[26]:
In [27]:
processed_df.shape
Out[27]:
In [28]:
NUM_CORES = 8
df_chunks = np.array_split(brown_df, NUM_CORES)
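np.array_split is used rather than np.split because it tolerates a row count that is not an exact multiple of NUM_CORES, returning eight roughly equal blocks. A quick illustration:

[len(chunk) for chunk in df_chunks]  # eight roughly equal row counts summing to len(brown_df)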
In [29]:
%%time
with multiprocessing.Pool(NUM_CORES) as pool:
    processed_df = pd.concat(pool.map(process_df, df_chunks), ignore_index=True)
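pool.map pickles each chunk, ships it to a worker process, runs process_df on it there, and returns the results in order; ignore_index=True rebuilds a clean 0..n-1 index after the concat. One hedged caveat: on a fork-based platform (Linux) a notebook can do this directly, but if the same code were moved into a script, the pool should be created under a main guard, since spawn-based platforms (Windows, recent macOS) re-import the module in every worker:

if __name__ == "__main__":
    with multiprocessing.Pool(NUM_CORES) as pool:
        processed_df = pd.concat(pool.map(process_df, df_chunks), ignore_index=True)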
In [30]:
processed_df.shape
Out[30]: