In [1]:
import pandas as pd
import re
In [2]:
# Extract the Twitter handle from a status URL, e.g.
# "http://twitter.com/malkanen/status/123" -> "malkanen".
# Use [^/]+ instead of the greedy .+ so the capture is a single path
# segment and cannot swallow an embedded ".../status/" from a longer path.
USER_HANDLE_REGEX = re.compile(r'twitter\.com/([^/]+)/status/')
# Quick sanity check on a sample URL (expected: ['malkanen']).
USER_HANDLE_REGEX.findall('http://twitter.com/malkanen/status/')
Out[2]:
In [3]:
dirname = "SkinDamage/SkinDamage"

# Processed export: drop rows missing key fields, then de-duplicate
# on the (Date, Author) pair.
df = pd.read_csv("%s_processed.old.csv" % dirname)
df = df.dropna(subset=["GUID", "Date", "Author"])
df = df.drop_duplicates(subset=["Date", "Author"])

# Raw export: normalise the date column name, derive Author ("@handle")
# from each tweet URL, and de-duplicate the same way.
df_orig = pd.read_csv("%s_noDublict.old.csv" % dirname)
df_orig = df_orig.rename(columns={"Date (CST)": "Date"})
df_orig["Author"] = df_orig.URL.apply(
    lambda url: "@%s" % USER_HANDLE_REGEX.findall(url)[0]
)
df_orig = df_orig.drop_duplicates(subset=["Date", "Author"])

df.shape, df_orig.shape
Out[3]:
In [4]:
%%time
df = df.assign(date_sorted=pd.to_datetime(df.Date)).sort_values(
["date_sorted", "Author"], ascending=False)
df_orig = df_orig.assign(date_sorted=pd.to_datetime(df_orig.Date)).sort_values(
["date_sorted", "Author"], ascending=False)
In [5]:
# Peek at the processed tweet text for the first few (newest) rows.
df.processedPost.head().values
Out[5]:
In [6]:
# Peek at the raw tweet text for comparison with the processed version above.
df_orig.Contents.head().values
Out[6]:
In [7]:
# Spot-check the key columns of the processed frame after sorting.
df[["GUID", "Author", "Date"]].head()
Out[7]:
In [8]:
# Spot-check the same key columns of the raw frame — rows should pair up
# one-to-one with the processed frame shown above.
df_orig[["GUID", "Author", "Date"]].head()
Out[8]:
In [9]:
# Remove the temporary sort key from both frames; on the raw frame also
# restore the original date column name and drop the derived Author column.
df = df.drop(columns=["date_sorted"])
df_orig = df_orig.rename(columns={"Date": "Date (CST)"})
df_orig = df_orig.drop(columns=["date_sorted", "Author"])
In [10]:
%%time
df.drop("URL", axis=1).to_csv("%s_processed.csv" % dirname, index=False)
df_orig.to_csv("%s_noDublict.csv" % dirname, index=False)
In [11]:
print dirname
df = pd.read_csv("%s_processed.csv" % dirname)
df_orig = pd.read_csv("%s_noDublict.csv" % dirname)
print df_orig.shape, df.shape
assert df_orig.shape[0] == df.shape[0]
df_merged = pd.concat([df, df_orig[["URL", "Contents"]]], axis=1)
print df_merged.shape
assert df_merged.shape[0] == df.shape[0]
assert (df_merged.Author != df_merged.URL.apply(lambda x: "@%s" % USER_HANDLE_REGEX.findall(x)[0])).sum() == 0
In [12]:
# Display the mismatch count directly (repeats the check asserted in the
# previous cell; expected output is 0).
(df_merged.Author != df_merged.URL.apply(lambda x: "@%s" % USER_HANDLE_REGEX.findall(x)[0])).sum()
Out[12]:
In [ ]: