In [1]:
import pandas as pd
import re
In [2]:
def recordReader(path_file):
    """Read a UTF-8 text file and return its whole content as one string.

    Parameters
    ----------
    path_file : str or path-like
        Location of the file to read.

    Returns
    -------
    str or None
        The file content, or ``None`` when the path argument is of an
        unusable type or the bytes cannot be decoded as UTF-8 (in both
        cases a message is printed).

    Raises
    ------
    OSError
        Missing or unreadable files are not handled here and propagate to
        the caller (e.g. ``FileNotFoundError``).
    """
    try:
        # The context manager guarantees the handle is closed, even when
        # decoding fails halfway through the read.
        with open(path_file, encoding='utf-8') as handle:
            return handle.read()
    except (TypeError, ValueError) as e:
        # UnicodeDecodeError is a ValueError subclass, so bad encodings
        # land here. The exception must be converted to str before it can
        # be concatenated to the message prefix.
        print("Error: " + str(e))
        return None
In [3]:
# Load the raw text of the two input documents.
text = recordReader("./data/mercado_ti.txt")
text2 = recordReader("./data/data_science.txt")
In [4]:
def input_split(text):
    """Strip unwanted characters from ``text`` and split it into tokens.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    pandas.DataFrame
        A single-column frame (column label ``0``) with one token per row.
        Consecutive separators produce empty-string tokens, which are left
        in place for the caller to filter.
    """
    # Newlines, tabs and other whitespace are word separators too. Turn
    # them into plain spaces first; otherwise the whitelist below deletes
    # them and glues the neighbouring words together.
    text = re.sub(r'\s', ' ', text)
    # Keep only letters (including Portuguese accented ones), ':' and space.
    text = re.sub(u'[^a-zA-ZáéíóúÁÉÍÓÚâêîôÂÊÎÔãõÃÕçÇ: ]', '', text)
    return pd.DataFrame(text.split(" "))
In [5]:
# Tokenise each document into a one-column DataFrame of words.
df_file1 = input_split(text)
df_file2 = input_split(text2)
In [6]:
def mapper(df):
    """Map step: turn a token frame into (key, value) pairs with value 1.

    The frame is modified in place (column ``0`` is renamed to ``key`` and
    a ``value`` column is added) and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Token frame whose tokens live in the column labelled ``0``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now with ``key`` and ``value`` columns.
    """
    df.rename(columns={0: "key"}, inplace=True)
    # `.at` is meant for single-cell access; plain column assignment
    # broadcasts the scalar to every row regardless of the frame length.
    df['value'] = 1
    return df
In [7]:
# Map step: tag every token with a count of 1.
# mapper works in place and returns its argument, so df1/df2 refer to the
# same objects as df_file1/df_file2.
df1 = mapper(df_file1)
df2 = mapper(df_file2)
In [8]:
# Preview the first rows of the mapped frame for the first document.
df1.head()
Out[8]:
In [9]:
# Preview the first rows of the mapped frame for the second document.
df2.head()
Out[9]:
In [10]:
def combiner(df_list):
    """Combine step: stack the given frames on top of each other.

    Parameters
    ----------
    df_list : list of pandas.DataFrame
        Frames to concatenate row-wise.

    Returns
    -------
    pandas.DataFrame
        One frame holding the rows of every input, in order. The original
        index labels are kept, so they may repeat.
    """
    stacked = pd.concat(objs=df_list)
    return stacked
In [11]:
# Combine step: stack both mapped frames into one.
# df_file1/df_file2 already carry the key/value columns because mapper
# modified them in place.
df_combiner = combiner([df_file1, df_file2])
In [12]:
# Preview the first rows of the combined frame.
df_combiner.head()
Out[12]:
In [13]:
def shufle(df):
    """Shuffle step: order rows by key and drop rows with an empty key.

    The input frame is left untouched; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``key`` column.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` sorted by ``key``, without the rows whose key is
        the empty string.
    """
    # Sort a copy instead of using inplace=True, so the caller's frame is
    # not silently reordered as a side effect.
    ordered = df.sort_values(by=['key'])
    return ordered[ordered.key != ""]
In [14]:
# Shuffle step: order rows by key and drop empty-string keys.
df_combiner = shufle(df_combiner)
In [15]:
# Preview the first rows after the shuffle step.
df_combiner.head()
Out[15]:
In [16]:
def reducer(df):
    """Reduce step: count how many rows each key has.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``key`` and ``value`` columns.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``key`` whose ``value`` column holds the number of
        non-null ``value`` entries for that key.
    """
    # Aggregate the frame that was passed in, not a notebook-level global,
    # so the function works on any input.
    return df.groupby('key').count()
In [17]:
# Reduce step: count rows per key, then preview the first 10 keys.
df_reducer = reducer(df_combiner)
df_reducer.head(10)
Out[17]:
In [18]:
# Rank keys from most to least frequent (sorts df_reducer in place).
df_reducer.sort_values(by=['value'], ascending=False, inplace=True)
In [20]:
# Show the first 20 rows of the reduced frame.
df_reducer.head(20)
Out[20]: