Why this fix is needed

Often the two files are not parallel: the noDublict file and the processed file list the same tweets in different orders. This notebook sorts both files consistently so that their rows align.


In [1]:
import pandas as pd
import re

In [2]:
# Extract the Twitter handle from a status URL, e.g.
# "http://twitter.com/malkanen/status/..." -> "malkanen".
# NOTE(review): the greedy ".+" would also swallow slashes for URLs with
# extra path segments (e.g. twitter.com/i/web/status/...) — confirm all
# URLs in these files have the simple /<handle>/status/ shape.
_HANDLE_PATTERN = r'twitter\.com/(.+)/status/'
USER_HANDLE_REGEX = re.compile(_HANDLE_PATTERN)
# Quick sanity check on a sample URL.
USER_HANDLE_REGEX.findall('http://twitter.com/malkanen/status/')


Out[2]:
['malkanen']

In [3]:
dirname = "SkinDamage/SkinDamage"

# Processed tweets: require the key columns to be present, then
# de-duplicate on (Date, Author) — the key used to align the two files.
df = pd.read_csv("%s_processed.old.csv" % dirname)
df = df.dropna(subset=["GUID", "Date", "Author"])
df = df.drop_duplicates(subset=["Date", "Author"])

# Original (noDublict) tweets: normalise the date column name and derive
# an Author handle from the status URL so both frames share the same key.
df_orig = pd.read_csv("%s_noDublict.old.csv" % dirname)
df_orig = df_orig.rename(columns={"Date (CST)": "Date"})
df_orig = df_orig.assign(
    Author=lambda frame: frame.URL.apply(
        lambda url: "@%s" % USER_HANDLE_REGEX.findall(url)[0]
    )
)
df_orig = df_orig.drop_duplicates(subset=["Date", "Author"])

# After de-duplication both frames should have the same number of rows.
df.shape, df_orig.shape


Out[3]:
((14128, 26), (14128, 17))

In [4]:
%%time
df = df.assign(date_sorted=pd.to_datetime(df.Date)).sort_values(
    ["date_sorted", "Author"], ascending=False)
df_orig = df_orig.assign(date_sorted=pd.to_datetime(df_orig.Date)).sort_values(
    ["date_sorted", "Author"], ascending=False)


CPU times: user 10 s, sys: 28 ms, total: 10 s
Wall time: 10 s

In [5]:
# Preview the normalised tweet text in sorted order.
df["processedPost"].head().values


Out[5]:
array([' like a reverse day walker only a red beard but still gets sun damage',
       ' health benefits of the sun sun damage skin care dailybeauty the beauty authority newbeauty melanoma',
       ' cyber monday tis the season to push back sun damage laser treatments such as fraxel cutera',
       ' shield your furniture from dust water pollen sap sun damage and other outdoor elements with this sorora usa single sofa cover the ad',
       ' oh my god they can sell us and sunscreen and skin cancer drugs and biotower condos when the sun gets too intense'], dtype=object)

In [6]:
# Preview the raw tweet text in the same (sorted) order for comparison.
df_orig["Contents"].head().values


Out[6]:
array([ '@ryansouth21 @joshelliott82 @jonnaofarc like a reverse Day Walker, only a red beard but still gets sun damage.',
       'Health Benefits of the Sun - Sun Damage - Skin Care - DailyBeauty - The Beauty Authority - NewBeauty #melanoma https://t.co/IamTksKpdT',
       'Cyber Monday : Tis The Season To Push Back Sun Damage - Laser treatments such as Fraxel, Cutera? https://t.co/Lt8S2nPVqx',
       'Shield your furniture from dust, water, pollen, sap, sun damage and other outdoor elements with this Sorora USA Single Sofa Cover. The ad...',
       '@ComposerBrad oh my God. They can sell us  and sunscreen and skin cancer drugs and biotower condos when the sun gets too intense. ??'], dtype=object)

In [7]:
# Spot-check the alignment keys on the processed frame.
df.loc[:, ["GUID", "Author", "Date"]].head()


Out[7]:
GUID Author Date
6231 8.030000e+17 @AyyMistaCarter 11/28/16 22:52
3976 8.030000e+17 @BOSTONSKIN 11/28/16 22:04
1805 8.030000e+17 @BarbaraPersons 11/28/16 21:50
9436 8.030000e+17 @Overstock 11/28/16 20:48
7574 8.030000e+17 @KyrieElissa 11/28/16 20:48

In [8]:
# The same keys on the original frame should match row-for-row.
df_orig.loc[:, ["GUID", "Author", "Date"]].head()


Out[8]:
GUID Author Date
2049 8.030000e+17 @AyyMistaCarter 11/28/16 22:52
997 8.030000e+17 @BOSTONSKIN 11/28/16 22:04
2385 8.030000e+17 @BarbaraPersons 11/28/16 21:50
118 8.030000e+17 @Overstock 11/28/16 20:48
6556 8.030000e+17 @KyrieElissa 11/28/16 20:48

In [9]:
# date_sorted was only needed for ordering; Author on the original frame
# was derived above from URL, so drop both and restore the original
# "Date (CST)" column name before writing back out.
df = df.drop(["date_sorted"], axis=1)
df_orig = df_orig.rename(columns={"Date": "Date (CST)"})
df_orig = df_orig.drop(["date_sorted", "Author"], axis=1)

In [10]:
%%time
df.drop("URL", axis=1).to_csv("%s_processed.csv" % dirname, index=False)
df_orig.to_csv("%s_noDublict.csv" % dirname, index=False)


CPU times: user 268 ms, sys: 12 ms, total: 280 ms
Wall time: 277 ms

In [11]:
# Reload the re-written files and verify the two are now row-aligned.
print dirname
df = pd.read_csv("%s_processed.csv" % dirname)
df_orig = pd.read_csv("%s_noDublict.csv" % dirname)
print df_orig.shape, df.shape
# The positional concat below is only valid if both frames have the same
# number of rows.
assert df_orig.shape[0] == df.shape[0]
# axis=1 concat relies on both frames carrying the default RangeIndex
# from read_csv, i.e. row i of df corresponds to row i of df_orig.
df_merged = pd.concat([df, df_orig[["URL", "Contents"]]], axis=1)
print df_merged.shape
assert df_merged.shape[0] == df.shape[0]
# Cross-check: the handle parsed from the original URL must equal the
# Author column of the processed row it was merged with (0 mismatches).
assert (df_merged.Author != df_merged.URL.apply(lambda x: "@%s" % USER_HANDLE_REGEX.findall(x)[0])).sum() == 0


SkinDamage/SkinDamage
(14128, 16) (14128, 25)
(14128, 27)

In [12]:
# Recompute the mismatch count explicitly (expected: 0).
derived_handles = df_merged.URL.apply(
    lambda u: "@%s" % USER_HANDLE_REGEX.findall(u)[0]
)
(df_merged.Author != derived_handles).sum()


Out[12]:
0

In [ ]: