notebook.community

Edit and run



In [27]:

    
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display, HTML



In [2]:

    
# Relative directory prefixes (each with a trailing slash) for processed data.
# NOTE(review): neither name is referenced by the cells visible in this
# notebook, which spell their paths out in full -- confirm they are still needed.
GOLD, TEST = "data/processed/gold/", "data/processed/input/"



In [3]:

    
# Subtask-A (sentence-level, three-point) gold files, one per data split.
dev, devtest, train = (
    "data/processed/gold/dev/100_topics_100_tweets.sentence-three-point.subtask-A.dev.gold.txt",
    "data/processed/gold/devtest/100_topics_100_tweets.sentence-three-point.subtask-A.devtest.gold.txt",
    "data/processed/gold/train/100_topics_100_tweets.sentence-three-point.subtask-A.train.gold.txt",
)



In [4]:

    
# Load the training gold file. It is tab-separated and has no header row,
# so pandas labels the columns with the integers 0, 1, 2, ...
df = pd.read_csv(
    train,
    sep="\t",
    header=None,
)



In [23]:

    
# Preview the first five rows (the default for head()) as a rich HTML table.
display(df.head(n=5))









    






  
    
      
      0
      1
      2
    
  
  
    
      0
      628949369883000832
      negative
      dear @Microsoft the newOoffice for Mac is grea...
    
    
      1
      628976607420645377
      negative
      @Microsoft how about you make a system that do...
    
    
      2
      629023169169518592
      negative
      Not Available
    
    
      3
      629179223232479232
      negative
      Not Available
    
    
      4
      629186282179153920
      neutral
      If I make a game as a #windows10 Universal App...



In [9]:

    
# Summarise the loaded frame: its (rows, columns) shape and how many rows carry
# each label in column 1.
# Written as print(...) calls: with a single parenthesised argument the output
# is the same under Python 2's print statement and Python 3's print function,
# whereas the bare statement form is a SyntaxError on Python 3.
print("Data shape: %s, %s" % df.shape)
print("Data per label:\n%s" % df[1].value_counts())









    



Data shape: 6000, 3
Data per label:
positive    3094
neutral     2043
negative     863
dtype: int64



In [13]:

    
# Keep only the rows whose text column (2) is something other than the
# "Not Available" placeholder; .copy() makes df_valid independent of df.
df_valid = df.loc[df[2].ne("Not Available")].copy()



In [14]:

    
# Same summary as for the full frame, restricted to rows with available text.
# print(...) call form so the cell parses on Python 3 as well as Python 2
# (single parenthesised argument, so the emitted text is unchanged).
print("Valid data shape: %s, %s" % df_valid.shape)
print("Valid data per label:\n%s" % df_valid[1].value_counts())









    



Valid data shape: 5366, 3
Valid data per label:
positive    2757
neutral     1844
negative     765
dtype: int64



In [32]:

    
# Subtask A (sentence-level, three-point): summarise every split's gold file.
# The paths are re-declared here so the cell does not depend on whichever
# subtask's paths were last assigned to dev/devtest/train.
dev = "data/processed/gold/dev/100_topics_100_tweets.sentence-three-point.subtask-A.dev.gold.txt"
devtest = "data/processed/gold/devtest/100_topics_100_tweets.sentence-three-point.subtask-A.devtest.gold.txt"
train = "data/processed/gold/train/100_topics_100_tweets.sentence-three-point.subtask-A.train.gold.txt"
for filename in [dev, devtest, train]:
    display(HTML("<h3>Printing details for file: %s</h3>" % filename))
    # Tab-separated, no header row: column 0 = tweet id, 1 = label, 2 = text.
    df = pd.read_csv(filename, header=None, sep="\t")
    display(df.head())
    # print(...) call form: identical output on Python 2, and valid on
    # Python 3 where the bare print statement is a SyntaxError.
    print("Data shape: %s, %s" % df.shape)
    print("Data per label:\n%s" % df[1].value_counts())
    # Drop rows whose text is the "Not Available" placeholder.
    df_valid = df[df[2] != "Not Available"].copy()
    print("Valid data shape: %s, %s" % df_valid.shape)
    print("Valid data per label:\n%s" % df_valid[1].value_counts())









    




Printing details for file: data/processed/gold/dev/100_topics_100_tweets.sentence-three-point.subtask-A.dev.gold.txt






    






  
    
      
      0
      1
      2
    
  
  
    
      0
      638060586258038784
      neutral
      05 Beat it - Michael Jackson - Thriller (25th ...
    
    
      1
      638061181823922176
      positive
      Jay Z joins Instagram with nostalgic tribute t...
    
    
      2
      638083821364244480
      neutral
      Michael Jackson: Bad 25th Anniversary Edition ...
    
    
      3
      638091450132078593
      positive
      Not Available
    
    
      4
      638125563790557184
      positive
      18th anniv of Princess Diana's death. I still ...
    
  








    



Data shape: 2000, 3
Data per label:
positive    844
neutral     765
negative    391
dtype: int64
Valid data shape: 1798, 3
Valid data per label:
positive    763
neutral     682
negative    353
dtype: int64






    




Printing details for file: data/processed/gold/devtest/100_topics_100_tweets.sentence-three-point.subtask-A.devtest.gold.txt






    






  
    
      
      0
      1
      2
    
  
  
    
      0
      637641175948763136
      neutral
      Not Available
    
    
      1
      637651487762554881
      neutral
      @PersonaSoda well yeah, that's third parties. ...
    
    
      2
      637666734300905472
      negative
      Not Available
    
    
      3
      637668142110654468
      neutral
      @fakethom Have android tab and don't use phone...
    
    
      4
      637708370129125377
      positive
      Finally I get my ps4 back I sent it to Sony ca...
    
  








    



Data shape: 2000, 3
Data per label:
positive    994
neutral     681
negative    325
dtype: int64
Valid data shape: 1774, 3
Valid data per label:
positive    878
neutral     618
negative    278
dtype: int64






    




Printing details for file: data/processed/gold/train/100_topics_100_tweets.sentence-three-point.subtask-A.train.gold.txt






    






  
    
      
      0
      1
      2
    
  
  
    
      0
      628949369883000832
      negative
      dear @Microsoft the newOoffice for Mac is grea...
    
    
      1
      628976607420645377
      negative
      @Microsoft how about you make a system that do...
    
    
      2
      629023169169518592
      negative
      Not Available
    
    
      3
      629179223232479232
      negative
      Not Available
    
    
      4
      629186282179153920
      neutral
      If I make a game as a #windows10 Universal App...
    
  








    



Data shape: 6000, 3
Data per label:
positive    3094
neutral     2043
negative     863
dtype: int64
Valid data shape: 5366, 3
Valid data per label:
positive    2757
neutral     1844
negative     765
dtype: int64



In [29]:

    
# Subtasks B/D (topic-level, two-point): summarise every split's gold file.
dev = "data/processed/gold/dev/100_topics_XXX_tweets.topic-two-point.subtask-BD.dev.gold.txt"
devtest = "data/processed/gold/devtest/100_topics_XXX_tweets.topic-two-point.subtask-BD.devtest.gold.txt"
train = "data/processed/gold/train/100_topics_XXX_tweets.topic-two-point.subtask-BD.train.gold.txt"
for filename in [dev, devtest, train]:
    display(HTML("<h3>Printing details for file: %s</h3>" % filename))
    # Tab-separated, no header row:
    # column 0 = tweet id, 1 = topic, 2 = label, 3 = text.
    df = pd.read_csv(filename, header=None, sep="\t")
    display(df.head())
    # print(...) call form: identical output on Python 2, and valid on
    # Python 3 where the bare print statement is a SyntaxError.
    print("Data shape: %s, %s" % df.shape)
    print("Data per label:\n%s" % df[2].value_counts())
    # Drop rows whose text is the "Not Available" placeholder.
    df_valid = df[df[3] != "Not Available"].copy()
    print("Valid data shape: %s, %s" % df_valid.shape)
    print("Valid data per label:\n%s" % df_valid[2].value_counts())









    




Printing details for file: data/processed/gold/dev/100_topics_XXX_tweets.topic-two-point.subtask-BD.dev.gold.txt






    






  
    
      
      0
      1
      2
      3
    
  
  
    
      0
      638061181823922176
      michael jackson
      positive
      Jay Z joins Instagram with nostalgic tribute t...
    
    
      1
      638091450132078593
      michael jackson
      positive
      Not Available
    
    
      2
      638125563790557184
      michael jackson
      positive
      18th anniv of Princess Diana's death. I still ...
    
    
      3
      638130776727535617
      michael jackson
      positive
      @oridaganjazz The 1st time I heard Michael Jac...
    
    
      4
      638134980862828544
      michael jackson
      positive
      'Michael Jackson' appeared on Saturday 29 at t...
    
  








    



Data shape: 1325, 4
Data per label:
positive    986
negative    339
dtype: int64
Valid data shape: 1193, 4
Valid data per label:
positive    881
negative    312
dtype: int64






    




Printing details for file: data/processed/gold/devtest/100_topics_XXX_tweets.topic-two-point.subtask-BD.devtest.gold.txt






    






  
    
      
      0
      1
      2
      3
    
  
  
    
      0
      637641175948763136
      sony
      positive
      Not Available
    
    
      1
      637666734300905472
      sony
      negative
      Not Available
    
    
      2
      637668142110654468
      sony
      positive
      @fakethom Have android tab and don't use phone...
    
    
      3
      637708370129125377
      sony
      positive
      Finally I get my ps4 back I sent it to Sony ca...
    
    
      4
      637807521500020737
      sony
      negative
      Not Available
    
  








    



Data shape: 1417, 4
Data per label:
positive    1153
negative     264
dtype: int64
Valid data shape: 1241, 4
Valid data per label:
positive    1007
negative     234
dtype: int64






    




Printing details for file: data/processed/gold/train/100_topics_XXX_tweets.topic-two-point.subtask-BD.train.gold.txt






    






  
    
      
      0
      1
      2
      3
    
  
  
    
      0
      628949369883000832
      @microsoft
      negative
      dear @Microsoft the newOoffice for Mac is grea...
    
    
      1
      628976607420645377
      @microsoft
      negative
      @Microsoft how about you make a system that do...
    
    
      2
      629023169169518592
      @microsoft
      negative
      Not Available
    
    
      3
      629179223232479232
      @microsoft
      negative
      Not Available
    
    
      4
      629226490152914944
      @microsoft
      positive
      Microsoft, I may not prefer your gaming branch...
    
  








    



Data shape: 4346, 4
Data per label:
positive    3591
negative     755
dtype: int64
Valid data shape: 3872, 4
Valid data per label:
positive    3205
negative     667
dtype: int64



In [31]:

    
# Subtasks C/E (topic-level, five-point): summarise every split's gold file.
dev = "data/processed/gold/dev/100_topics_100_tweets.topic-five-point.subtask-CE.dev.gold.txt"
devtest = "data/processed/gold/devtest/100_topics_100_tweets.topic-five-point.subtask-CE.devtest.gold.txt"
train = "data/processed/gold/train/100_topics_100_tweets.topic-five-point.subtask-CE.train.gold.txt"
for filename in [dev, devtest, train]:
    display(HTML("<h3>Printing details for file: %s</h3>" % filename))
    # Tab-separated, no header row:
    # column 0 = tweet id, 1 = topic, 2 = integer label, 3 = text.
    df = pd.read_csv(filename, header=None, sep="\t")
    display(df.head())
    # print(...) call form: identical output on Python 2, and valid on
    # Python 3 where the bare print statement is a SyntaxError.
    print("Data shape: %s, %s" % df.shape)
    print("Data per label:\n%s" % df[2].value_counts())
    # Drop rows whose text is the "Not Available" placeholder.
    df_valid = df[df[3] != "Not Available"].copy()
    print("Valid data shape: %s, %s" % df_valid.shape)
    print("Valid data per label:\n%s" % df_valid[2].value_counts())









    




Printing details for file: data/processed/gold/dev/100_topics_100_tweets.topic-five-point.subtask-CE.dev.gold.txt






    






  
    
      
      0
      1
      2
      3
    
  
  
    
      0
      638060586258038784
      michael jackson
      0
      05 Beat it - Michael Jackson - Thriller (25th ...
    
    
      1
      638061181823922176
      michael jackson
      1
      Jay Z joins Instagram with nostalgic tribute t...
    
    
      2
      638083821364244480
      michael jackson
      0
      Michael Jackson: Bad 25th Anniversary Edition ...
    
    
      3
      638091450132078593
      michael jackson
      1
      Not Available
    
    
      4
      638125563790557184
      michael jackson
      1
      18th anniv of Princess Diana's death. I still ...
    
  








    



Data shape: 2000, 4
Data per label:
 1    933
 0    675
-1    296
 2     53
-2     43
dtype: int64
Valid data shape: 1798, 4
Valid data per label:
 1    837
 0    605
-1    275
 2     44
-2     37
dtype: int64






    




Printing details for file: data/processed/gold/devtest/100_topics_100_tweets.topic-five-point.subtask-CE.devtest.gold.txt






    






  
    
      
      0
      1
      2
      3
    
  
  
    
      0
      637641175948763136
      sony
      1
      Not Available
    
    
      1
      637651487762554881
      sony
      0
      @PersonaSoda well yeah, that's third parties. ...
    
    
      2
      637666734300905472
      sony
      -1
      Not Available
    
    
      3
      637668142110654468
      sony
      1
      @fakethom Have android tab and don't use phone...
    
    
      4
      637708370129125377
      sony
      1
      Finally I get my ps4 back I sent it to Sony ca...
    
  








    



Data shape: 2000, 4
Data per label:
 1    1005
 0     583
-1     233
 2     148
-2      31
dtype: int64
Valid data shape: 1774, 4
Valid data per label:
 1    882
 0    533
-1    205
 2    125
-2     29
dtype: int64






    




Printing details for file: data/processed/gold/train/100_topics_100_tweets.topic-five-point.subtask-CE.train.gold.txt






    






  
    
      
      0
      1
      2
      3
    
  
  
    
      0
      628949369883000832
      @microsoft
      -1
      dear @Microsoft the newOoffice for Mac is grea...
    
    
      1
      628976607420645377
      @microsoft
      -2
      @Microsoft how about you make a system that do...
    
    
      2
      629023169169518592
      @microsoft
      -1
      Not Available
    
    
      3
      629179223232479232
      @microsoft
      -1
      Not Available
    
    
      4
      629186282179153920
      @microsoft
      0
      If I make a game as a #windows10 Universal App...
    
  








    



Data shape: 6000, 4
Data per label:
 1    3154
 0    1654
-1     668
 2     437
-2      87
dtype: int64
Valid data shape: 5366, 4
Valid data per label:
 1    2827
 0    1494
-1     597
 2     378
-2      70
dtype: int64



In [ ]:

	0	1	2
0	628949369883000832	negative	dear @Microsoft the newOoffice for Mac is grea...
1	628976607420645377	negative	@Microsoft how about you make a system that do...
2	629023169169518592	negative	Not Available
3	629179223232479232	negative	Not Available
4	629186282179153920	neutral	If I make a game as a #windows10 Universal App...

	0	1	2
0	638060586258038784	neutral	05 Beat it - Michael Jackson - Thriller (25th ...
1	638061181823922176	positive	Jay Z joins Instagram with nostalgic tribute t...
2	638083821364244480	neutral	Michael Jackson: Bad 25th Anniversary Edition ...
3	638091450132078593	positive	Not Available
4	638125563790557184	positive	18th anniv of Princess Diana's death. I still ...

	0	1	2
0	637641175948763136	neutral	Not Available
1	637651487762554881	neutral	@PersonaSoda well yeah, that's third parties. ...
2	637666734300905472	negative	Not Available
3	637668142110654468	neutral	@fakethom Have android tab and don't use phone...
4	637708370129125377	positive	Finally I get my ps4 back I sent it to Sony ca...

	0	1	2	3
0	638061181823922176	michael jackson	positive	Jay Z joins Instagram with nostalgic tribute t...
1	638091450132078593	michael jackson	positive	Not Available
2	638125563790557184	michael jackson	positive	18th anniv of Princess Diana's death. I still ...
3	638130776727535617	michael jackson	positive	@oridaganjazz The 1st time I heard Michael Jac...
4	638134980862828544	michael jackson	positive	'Michael Jackson' appeared on Saturday 29 at t...

	0	1	2	3
0	637641175948763136	sony	positive	Not Available
1	637666734300905472	sony	negative	Not Available
2	637668142110654468	sony	positive	@fakethom Have android tab and don't use phone...
3	637708370129125377	sony	positive	Finally I get my ps4 back I sent it to Sony ca...
4	637807521500020737	sony	negative	Not Available

	0	1	2	3
0	628949369883000832	@microsoft	negative	dear @Microsoft the newOoffice for Mac is grea...
1	628976607420645377	@microsoft	negative	@Microsoft how about you make a system that do...
2	629023169169518592	@microsoft	negative	Not Available
3	629179223232479232	@microsoft	negative	Not Available
4	629226490152914944	@microsoft	positive	Microsoft, I may not prefer your gaming branch...