In [27]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display, HTML

In [2]:
# Root directories inside the processed-data tree. Both keep their trailing
# slash so that a relative file name can be appended by plain concatenation.
GOLD, TEST = "data/processed/gold/", "data/processed/input/"

In [3]:
# Gold files for subtask A, one per split. The paths are derived from the
# GOLD root (which already ends with a slash) rather than repeating the
# directory prefix in every literal; the resulting strings are unchanged.
dev = GOLD + "dev/100_topics_100_tweets.sentence-three-point.subtask-A.dev.gold.txt"
devtest = GOLD + "devtest/100_topics_100_tweets.sentence-three-point.subtask-A.devtest.gold.txt"
train = GOLD + "train/100_topics_100_tweets.sentence-three-point.subtask-A.train.gold.txt"

In [4]:
# Load the training split: tab-separated with no header row, so pandas
# labels the columns with integers (0, 1, 2, ...).
df = pd.read_csv(train, sep="\t", header=None)

In [23]:
# Rich-render the first five rows of the loaded frame as a quick sanity check.
display(df.head(n=5))


0 1 2
0 628949369883000832 negative dear @Microsoft the newOoffice for Mac is grea...
1 628976607420645377 negative @Microsoft how about you make a system that do...
2 629023169169518592 negative Not Available
3 629179223232479232 negative Not Available
4 629186282179153920 neutral If I make a game as a #windows10 Universal App...

In [9]:
# Report the frame dimensions and the number of rows per label (column 1).
# print(...) with a single parenthesised argument prints the same text under
# Python 2 and Python 3, whereas the bare print statement is a SyntaxError
# on Python 3.
print("Data shape: %s, %s" % df.shape)
print("Data per label:\n%s" % df[1].value_counts())


Data shape: 6000, 3
Data per label:
positive    3094
neutral     2043
negative     863
dtype: int64

In [13]:
# Keep only rows whose text column (2) is something other than the
# "Not Available" placeholder; .copy() detaches the result from `df`.
# NOTE(review): presumably the placeholder marks tweets whose text could not
# be retrieved -- confirm against the data preparation step.
df_valid = df.loc[~(df[2] == "Not Available")].copy()

In [14]:
# Same summary as for the full frame, restricted to the rows kept in df_valid.
# Written as print(...) calls so the cell parses on Python 3 as well as
# Python 2 (the bare print statement is Python 2 only).
print("Valid data shape: %s, %s" % df_valid.shape)
print("Valid data per label:\n%s" % df_valid[1].value_counts())


Valid data shape: 5366, 3
Valid data per label:
positive    2757
neutral     1844
negative     765
dtype: int64

In [32]:
# Subtask A gold files ("sentence-three-point" per the file names), one per
# split. Paths are built from the GOLD root defined earlier in the notebook
# instead of repeating the directory prefix; the strings are unchanged.
dev = GOLD + "dev/100_topics_100_tweets.sentence-three-point.subtask-A.dev.gold.txt"
devtest = GOLD + "devtest/100_topics_100_tweets.sentence-three-point.subtask-A.devtest.gold.txt"
train = GOLD + "train/100_topics_100_tweets.sentence-three-point.subtask-A.train.gold.txt"
for filename in [dev, devtest, train]:
    display(HTML("<h3>Printing details for file: %s</h3>" % filename))
    # Tab-separated, no header row: columns are addressed by integer position.
    df = pd.read_csv(filename, header=None, sep="\t")
    display(df.head())
    # print(...) with one parenthesised argument emits the same text on
    # Python 2 and Python 3; the bare print statement fails on Python 3.
    print("Data shape: %s, %s" % df.shape)
    # In these files column 1 is the label and column 2 is the tweet text.
    print("Data per label:\n%s" % df[1].value_counts())
    # Drop rows whose text is the "Not Available" placeholder.
    df_valid = df[df[2] != "Not Available"].copy()
    print("Valid data shape: %s, %s" % df_valid.shape)
    print("Valid data per label:\n%s" % df_valid[1].value_counts())


Printing details for file: data/processed/gold/dev/100_topics_100_tweets.sentence-three-point.subtask-A.dev.gold.txt

0 1 2
0 638060586258038784 neutral 05 Beat it - Michael Jackson - Thriller (25th ...
1 638061181823922176 positive Jay Z joins Instagram with nostalgic tribute t...
2 638083821364244480 neutral Michael Jackson: Bad 25th Anniversary Edition ...
3 638091450132078593 positive Not Available
4 638125563790557184 positive 18th anniv of Princess Diana's death. I still ...
Data shape: 2000, 3
Data per label:
positive    844
neutral     765
negative    391
dtype: int64
Valid data shape: 1798, 3
Valid data per label:
positive    763
neutral     682
negative    353
dtype: int64

Printing details for file: data/processed/gold/devtest/100_topics_100_tweets.sentence-three-point.subtask-A.devtest.gold.txt

0 1 2
0 637641175948763136 neutral Not Available
1 637651487762554881 neutral @PersonaSoda well yeah, that's third parties. ...
2 637666734300905472 negative Not Available
3 637668142110654468 neutral @fakethom Have android tab and don't use phone...
4 637708370129125377 positive Finally I get my ps4 back I sent it to Sony ca...
Data shape: 2000, 3
Data per label:
positive    994
neutral     681
negative    325
dtype: int64
Valid data shape: 1774, 3
Valid data per label:
positive    878
neutral     618
negative    278
dtype: int64

Printing details for file: data/processed/gold/train/100_topics_100_tweets.sentence-three-point.subtask-A.train.gold.txt

0 1 2
0 628949369883000832 negative dear @Microsoft the newOoffice for Mac is grea...
1 628976607420645377 negative @Microsoft how about you make a system that do...
2 629023169169518592 negative Not Available
3 629179223232479232 negative Not Available
4 629186282179153920 neutral If I make a game as a #windows10 Universal App...
Data shape: 6000, 3
Data per label:
positive    3094
neutral     2043
negative     863
dtype: int64
Valid data shape: 5366, 3
Valid data per label:
positive    2757
neutral     1844
negative     765
dtype: int64

In [29]:
# Subtask B/D gold files ("topic-two-point" per the file names), one per
# split. Paths are built from the GOLD root defined earlier in the notebook
# instead of repeating the directory prefix; the strings are unchanged.
dev = GOLD + "dev/100_topics_XXX_tweets.topic-two-point.subtask-BD.dev.gold.txt"
devtest = GOLD + "devtest/100_topics_XXX_tweets.topic-two-point.subtask-BD.devtest.gold.txt"
train = GOLD + "train/100_topics_XXX_tweets.topic-two-point.subtask-BD.train.gold.txt"
for filename in [dev, devtest, train]:
    display(HTML("<h3>Printing details for file: %s</h3>" % filename))
    # Tab-separated, no header row: columns are addressed by integer position.
    df = pd.read_csv(filename, header=None, sep="\t")
    display(df.head())
    # print(...) with one parenthesised argument emits the same text on
    # Python 2 and Python 3; the bare print statement fails on Python 3.
    print("Data shape: %s, %s" % df.shape)
    # These files carry an extra topic column (1), so the label sits in
    # column 2 and the tweet text in column 3.
    print("Data per label:\n%s" % df[2].value_counts())
    # Drop rows whose text is the "Not Available" placeholder.
    df_valid = df[df[3] != "Not Available"].copy()
    print("Valid data shape: %s, %s" % df_valid.shape)
    print("Valid data per label:\n%s" % df_valid[2].value_counts())


Printing details for file: data/processed/gold/dev/100_topics_XXX_tweets.topic-two-point.subtask-BD.dev.gold.txt

0 1 2 3
0 638061181823922176 michael jackson positive Jay Z joins Instagram with nostalgic tribute t...
1 638091450132078593 michael jackson positive Not Available
2 638125563790557184 michael jackson positive 18th anniv of Princess Diana's death. I still ...
3 638130776727535617 michael jackson positive @oridaganjazz The 1st time I heard Michael Jac...
4 638134980862828544 michael jackson positive 'Michael Jackson' appeared on Saturday 29 at t...
Data shape: 1325, 4
Data per label:
positive    986
negative    339
dtype: int64
Valid data shape: 1193, 4
Valid data per label:
positive    881
negative    312
dtype: int64

Printing details for file: data/processed/gold/devtest/100_topics_XXX_tweets.topic-two-point.subtask-BD.devtest.gold.txt

0 1 2 3
0 637641175948763136 sony positive Not Available
1 637666734300905472 sony negative Not Available
2 637668142110654468 sony positive @fakethom Have android tab and don't use phone...
3 637708370129125377 sony positive Finally I get my ps4 back I sent it to Sony ca...
4 637807521500020737 sony negative Not Available
Data shape: 1417, 4
Data per label:
positive    1153
negative     264
dtype: int64
Valid data shape: 1241, 4
Valid data per label:
positive    1007
negative     234
dtype: int64

Printing details for file: data/processed/gold/train/100_topics_XXX_tweets.topic-two-point.subtask-BD.train.gold.txt

0 1 2 3
0 628949369883000832 @microsoft negative dear @Microsoft the newOoffice for Mac is grea...
1 628976607420645377 @microsoft negative @Microsoft how about you make a system that do...
2 629023169169518592 @microsoft negative Not Available
3 629179223232479232 @microsoft negative Not Available
4 629226490152914944 @microsoft positive Microsoft, I may not prefer your gaming branch...
Data shape: 4346, 4
Data per label:
positive    3591
negative     755
dtype: int64
Valid data shape: 3872, 4
Valid data per label:
positive    3205
negative     667
dtype: int64

In [31]:
# Subtask C/E gold files ("topic-five-point" per the file names), one per
# split. Paths are built from the GOLD root defined earlier in the notebook
# instead of repeating the directory prefix; the strings are unchanged.
dev = GOLD + "dev/100_topics_100_tweets.topic-five-point.subtask-CE.dev.gold.txt"
devtest = GOLD + "devtest/100_topics_100_tweets.topic-five-point.subtask-CE.devtest.gold.txt"
train = GOLD + "train/100_topics_100_tweets.topic-five-point.subtask-CE.train.gold.txt"
for filename in [dev, devtest, train]:
    display(HTML("<h3>Printing details for file: %s</h3>" % filename))
    # Tab-separated, no header row: columns are addressed by integer position.
    df = pd.read_csv(filename, header=None, sep="\t")
    display(df.head())
    # print(...) with one parenthesised argument emits the same text on
    # Python 2 and Python 3; the bare print statement fails on Python 3.
    print("Data shape: %s, %s" % df.shape)
    # Column 1 is the topic, column 2 the integer label (-2 .. 2 in the
    # recorded output) and column 3 the tweet text.
    print("Data per label:\n%s" % df[2].value_counts())
    # Drop rows whose text is the "Not Available" placeholder.
    df_valid = df[df[3] != "Not Available"].copy()
    print("Valid data shape: %s, %s" % df_valid.shape)
    print("Valid data per label:\n%s" % df_valid[2].value_counts())


Printing details for file: data/processed/gold/dev/100_topics_100_tweets.topic-five-point.subtask-CE.dev.gold.txt

0 1 2 3
0 638060586258038784 michael jackson 0 05 Beat it - Michael Jackson - Thriller (25th ...
1 638061181823922176 michael jackson 1 Jay Z joins Instagram with nostalgic tribute t...
2 638083821364244480 michael jackson 0 Michael Jackson: Bad 25th Anniversary Edition ...
3 638091450132078593 michael jackson 1 Not Available
4 638125563790557184 michael jackson 1 18th anniv of Princess Diana's death. I still ...
Data shape: 2000, 4
Data per label:
 1    933
 0    675
-1    296
 2     53
-2     43
dtype: int64
Valid data shape: 1798, 4
Valid data per label:
 1    837
 0    605
-1    275
 2     44
-2     37
dtype: int64

Printing details for file: data/processed/gold/devtest/100_topics_100_tweets.topic-five-point.subtask-CE.devtest.gold.txt

0 1 2 3
0 637641175948763136 sony 1 Not Available
1 637651487762554881 sony 0 @PersonaSoda well yeah, that's third parties. ...
2 637666734300905472 sony -1 Not Available
3 637668142110654468 sony 1 @fakethom Have android tab and don't use phone...
4 637708370129125377 sony 1 Finally I get my ps4 back I sent it to Sony ca...
Data shape: 2000, 4
Data per label:
 1    1005
 0     583
-1     233
 2     148
-2      31
dtype: int64
Valid data shape: 1774, 4
Valid data per label:
 1    882
 0    533
-1    205
 2    125
-2     29
dtype: int64

Printing details for file: data/processed/gold/train/100_topics_100_tweets.topic-five-point.subtask-CE.train.gold.txt

0 1 2 3
0 628949369883000832 @microsoft -1 dear @Microsoft the newOoffice for Mac is grea...
1 628976607420645377 @microsoft -2 @Microsoft how about you make a system that do...
2 629023169169518592 @microsoft -1 Not Available
3 629179223232479232 @microsoft -1 Not Available
4 629186282179153920 @microsoft 0 If I make a game as a #windows10 Universal App...
Data shape: 6000, 4
Data per label:
 1    3154
 0    1654
-1     668
 2     437
-2      87
dtype: int64
Valid data shape: 5366, 4
Valid data per label:
 1    2827
 0    1494
-1     597
 2     378
-2      70
dtype: int64

In [ ]: