Why this fix is needed

Often the two files are not parallel: the noDublict file and the processed file list the same tweets in different orders. This notebook sorts both files consistently so that their rows align.


In [1]:
import pandas as pd
import re

In [2]:
# Extract the Twitter handle from a status URL, e.g.
# "http://twitter.com/malkanen/status/..." -> "malkanen".
# NOTE(review): the greedy ".+" would also swallow slashes for URLs with
# extra path segments (e.g. twitter.com/i/web/status/...) — confirm all
# URLs in these files have the simple /<handle>/status/ shape.
_HANDLE_PATTERN = r'twitter\.com/(.+)/status/'
USER_HANDLE_REGEX = re.compile(_HANDLE_PATTERN)
# Quick sanity check on a sample URL.
USER_HANDLE_REGEX.findall('http://twitter.com/malkanen/status/')


Out[2]:
['malkanen']

In [3]:
dirname = "SkinDamage/SkinDamage"

# Processed tweets: require the key columns to be present, then
# de-duplicate on (Date, Author) — the key used to align the two files.
df = pd.read_csv("%s_processed.old.csv" % dirname)
df = df.dropna(subset=["GUID", "Date", "Author"])
df = df.drop_duplicates(subset=["Date", "Author"])

# Original (noDublict) tweets: normalise the date column name and derive
# an Author handle from the status URL so both frames share the same key.
df_orig = pd.read_csv("%s_noDublict.old.csv" % dirname)
df_orig = df_orig.rename(columns={"Date (CST)": "Date"})
df_orig = df_orig.assign(
    Author=lambda frame: frame.URL.apply(
        lambda url: "@%s" % USER_HANDLE_REGEX.findall(url)[0]
    )
)
df_orig = df_orig.drop_duplicates(subset=["Date", "Author"])

# After de-duplication both frames should have the same number of rows.
df.shape, df_orig.shape


Out[3]:
((14128, 26), (14128, 17))

In [4]:
%%time
df = df.assign(date_sorted=pd.to_datetime(df.Date)).sort_values(
    ["date_sorted", "Author"], ascending=False)
df_orig = df_orig.assign(date_sorted=pd.to_datetime(df_orig.Date)).sort_values(
    ["date_sorted", "Author"], ascending=False)


CPU times: user 10 s, sys: 28 ms, total: 10 s
Wall time: 10 s

In [5]:
# Preview the normalised tweet text in sorted order.
df["processedPost"].head().values


Out[5]:
array([' like a reverse day walker only a red beard but still gets sun damage',
       ' health benefits of the sun sun damage skin care dailybeauty the beauty authority newbeauty melanoma',
       ' cyber monday tis the season to push back sun damage laser treatments such as fraxel cutera',
       ' shield your furniture from dust water pollen sap sun damage and other outdoor elements with this sorora usa single sofa cover the ad',
       ' oh my god they can sell us and sunscreen and skin cancer drugs and biotower condos when the sun gets too intense'], dtype=object)

In [6]:
# Preview the raw tweet text in the same (sorted) order for comparison.
df_orig["Contents"].head().values


Out[6]:
array([ '@ryansouth21 @joshelliott82 @jonnaofarc like a reverse Day Walker, only a red beard but still gets sun damage.',
       'Health Benefits of the Sun - Sun Damage - Skin Care - DailyBeauty - The Beauty Authority - NewBeauty #melanoma https://t.co/IamTksKpdT',
       'Cyber Monday : Tis The Season To Push Back Sun Damage - Laser treatments such as Fraxel, Cutera? https://t.co/Lt8S2nPVqx',
       'Shield your furniture from dust, water, pollen, sap, sun damage and other outdoor elements with this Sorora USA Single Sofa Cover. The ad...',
       '@ComposerBrad oh my God. They can sell us  and sunscreen and skin cancer drugs and biotower condos when the sun gets too intense. ??'], dtype=object)

In [7]:
# Spot-check the alignment keys on the processed frame.
df.loc[:, ["GUID", "Author", "Date"]].head()


Out[7]:
GUID Author Date
6231 8.030000e+17 @AyyMistaCarter 11/28/16 22:52
3976 8.030000e+17 @BOSTONSKIN 11/28/16 22:04
1805 8.030000e+17 @BarbaraPersons 11/28/16 21:50
9436 8.030000e+17 @Overstock 11/28/16 20:48
7574 8.030000e+17 @KyrieElissa 11/28/16 20:48

In [8]:
# The same keys on the original frame should match row-for-row.
df_orig.loc[:, ["GUID", "Author", "Date"]].head()


Out[8]:
GUID Author Date
2049 8.030000e+17 @AyyMistaCarter 11/28/16 22:52
997 8.030000e+17 @BOSTONSKIN 11/28/16 22:04
2385 8.030000e+17 @BarbaraPersons 11/28/16 21:50
118 8.030000e+17 @Overstock 11/28/16 20:48
6556 8.030000e+17 @KyrieElissa 11/28/16 20:48

In [9]:
# date_sorted was only needed for ordering; Author on the original frame
# was derived above from URL, so drop both and restore the original
# "Date (CST)" column name before writing back out.
df = df.drop(["date_sorted"], axis=1)
df_orig = df_orig.rename(columns={"Date": "Date (CST)"})
df_orig = df_orig.drop(["date_sorted", "Author"], axis=1)

In [10]:
%%time
df.drop("URL", axis=1).to_csv("%s_processed.csv" % dirname, index=False)
df_orig.to_csv("%s_noDublict.csv" % dirname, index=False)


CPU times: user 268 ms, sys: 12 ms, total: 280 ms
Wall time: 277 ms

In [11]:
# Reload the re-written files and verify the two are now row-aligned.
print dirname
df = pd.read_csv("%s_processed.csv" % dirname)
df_orig = pd.read_csv("%s_noDublict.csv" % dirname)
print df_orig.shape, df.shape
# The positional concat below is only valid if both frames have the same
# number of rows.
assert df_orig.shape[0] == df.shape[0]
# axis=1 concat relies on both frames carrying the default RangeIndex
# from read_csv, i.e. row i of df corresponds to row i of df_orig.
df_merged = pd.concat([df, df_orig[["URL", "Contents"]]], axis=1)
print df_merged.shape
assert df_merged.shape[0] == df.shape[0]
# Cross-check: the handle parsed from the original URL must equal the
# Author column of the processed row it was merged with (0 mismatches).
assert (df_merged.Author != df_merged.URL.apply(lambda x: "@%s" % USER_HANDLE_REGEX.findall(x)[0])).sum() == 0


SkinDamage/SkinDamage
(14128, 16) (14128, 25)
(14128, 27)

In [12]:
# Recompute the mismatch count explicitly (expected: 0).
derived_handles = df_merged.URL.apply(
    lambda u: "@%s" % USER_HANDLE_REGEX.findall(u)[0]
)
(df_merged.Author != derived_handles).sum()


Out[12]:
0

In [ ]: