In [27]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display, HTML
In [2]:
# Root directories under data/processed/; the trailing slash is part of each
# value. NOTE(review): no cell visible in this notebook reads GOLD or TEST --
# the later cells spell out full file paths instead.
GOLD, TEST = "data/processed/gold/", "data/processed/input/"
In [3]:
# Gold files for subtask A (sentence-level, three-point scale per the file
# names), one path per split: dev, devtest and train.
dev, devtest, train = (
    "data/processed/gold/dev/100_topics_100_tweets.sentence-three-point.subtask-A.dev.gold.txt",
    "data/processed/gold/devtest/100_topics_100_tweets.sentence-three-point.subtask-A.devtest.gold.txt",
    "data/processed/gold/train/100_topics_100_tweets.sentence-three-point.subtask-A.train.gold.txt",
)
In [4]:
# Load the file referenced by `train` as tab-separated text. There is no
# header row, so pandas labels the columns with integers (0, 1, 2, ...).
# NOTE(review): pandas' default quote handling is in effect; if a text column
# can contain stray double quotes, rows could be merged -- confirm on the data.
df = pd.read_csv(train, sep="\t", header=None)
In [23]:
# Rich-render the first five rows as a quick look at the loaded frame.
display(df.head(n=5))
In [9]:
# Report the frame's (rows, columns) and how many rows carry each distinct
# value of column 1 (the column reported here as the label).
# A parenthesised single-argument print behaves the same on Python 2 and 3,
# so this cell no longer fails with a SyntaxError on a Python 3 kernel.
print("Data shape: %s, %s" % df.shape)
print("Data per label:\n%s" % df[1].value_counts())
In [13]:
# Keep only the rows whose column 2 is not the literal "Not Available"
# (presumably the placeholder for text that could not be retrieved -- TODO
# confirm). The copy makes df_valid independent of df.
df_valid = df.loc[df[2].ne("Not Available")].copy()
In [14]:
# Same summary as for the full frame, but on the filtered rows in df_valid:
# (rows, columns) followed by the row count per distinct value of column 1.
# A parenthesised single-argument print behaves the same on Python 2 and 3,
# so this cell no longer fails with a SyntaxError on a Python 3 kernel.
print("Valid data shape: %s, %s" % df_valid.shape)
print("Valid data per label:\n%s" % df_valid[1].value_counts())
In [32]:
# Per-split summary for the subtask-A gold files (dev, devtest, train).
dev = "data/processed/gold/dev/100_topics_100_tweets.sentence-three-point.subtask-A.dev.gold.txt"
devtest = "data/processed/gold/devtest/100_topics_100_tweets.sentence-three-point.subtask-A.devtest.gold.txt"
train = "data/processed/gold/train/100_topics_100_tweets.sentence-three-point.subtask-A.train.gold.txt"

# For each file: show a heading and the first rows, then print the shape and
# per-label counts for all rows and for the rows that remain after dropping
# "Not Available" entries.
# Note: the loop rebinds the notebook-level names df and df_valid on every
# iteration, so after the cell runs they refer to the last file (train).
for filename in [dev, devtest, train]:
    display(HTML("<h3>Printing details for file: %s</h3>" % filename))
    # Tab-separated, no header row: columns get integer labels 0, 1, 2, ...
    df = pd.read_csv(filename, header=None, sep="\t")
    display(df.head())
    # Parenthesised single-argument print works on both Python 2 and 3.
    print("Data shape: %s, %s" % df.shape)
    # Column 1 is the column reported as the label in these files.
    print("Data per label:\n%s" % df[1].value_counts())
    # Column 2 holds the literal "Not Available" for rows to be excluded
    # (presumably text that could not be retrieved -- TODO confirm).
    df_valid = df[df[2] != "Not Available"].copy()
    print("Valid data shape: %s, %s" % df_valid.shape)
    print("Valid data per label:\n%s" % df_valid[1].value_counts())
In [29]:
# Per-split summary for the subtask-BD gold files (dev, devtest, train).
dev = "data/processed/gold/dev/100_topics_XXX_tweets.topic-two-point.subtask-BD.dev.gold.txt"
devtest = "data/processed/gold/devtest/100_topics_XXX_tweets.topic-two-point.subtask-BD.devtest.gold.txt"
train = "data/processed/gold/train/100_topics_XXX_tweets.topic-two-point.subtask-BD.train.gold.txt"

# For each file: show a heading and the first rows, then print the shape and
# per-label counts for all rows and for the rows that remain after dropping
# "Not Available" entries.
# Note: the loop rebinds the notebook-level names df and df_valid on every
# iteration, so after the cell runs they refer to the last file (train).
for filename in [dev, devtest, train]:
    display(HTML("<h3>Printing details for file: %s</h3>" % filename))
    # Tab-separated, no header row: columns get integer labels 0, 1, 2, ...
    df = pd.read_csv(filename, header=None, sep="\t")
    display(df.head())
    # Parenthesised single-argument print works on both Python 2 and 3.
    print("Data shape: %s, %s" % df.shape)
    # In these files the label is read from column 2 (column 1 is presumably
    # the topic -- TODO confirm against the data).
    print("Data per label:\n%s" % df[2].value_counts())
    # Column 3 holds the literal "Not Available" for rows to be excluded
    # (presumably text that could not be retrieved -- TODO confirm).
    df_valid = df[df[3] != "Not Available"].copy()
    print("Valid data shape: %s, %s" % df_valid.shape)
    print("Valid data per label:\n%s" % df_valid[2].value_counts())
In [31]:
# Per-split summary for the subtask-CE gold files (dev, devtest, train).
dev = "data/processed/gold/dev/100_topics_100_tweets.topic-five-point.subtask-CE.dev.gold.txt"
devtest = "data/processed/gold/devtest/100_topics_100_tweets.topic-five-point.subtask-CE.devtest.gold.txt"
train = "data/processed/gold/train/100_topics_100_tweets.topic-five-point.subtask-CE.train.gold.txt"

# For each file: show a heading and the first rows, then print the shape and
# per-label counts for all rows and for the rows that remain after dropping
# "Not Available" entries.
# Note: the loop rebinds the notebook-level names df and df_valid on every
# iteration, so after the cell runs they refer to the last file (train).
for filename in [dev, devtest, train]:
    display(HTML("<h3>Printing details for file: %s</h3>" % filename))
    # Tab-separated, no header row: columns get integer labels 0, 1, 2, ...
    df = pd.read_csv(filename, header=None, sep="\t")
    display(df.head())
    # Parenthesised single-argument print works on both Python 2 and 3.
    print("Data shape: %s, %s" % df.shape)
    # In these files the label is read from column 2 (column 1 is presumably
    # the topic -- TODO confirm against the data).
    print("Data per label:\n%s" % df[2].value_counts())
    # Column 3 holds the literal "Not Available" for rows to be excluded
    # (presumably text that could not be retrieved -- TODO confirm).
    df_valid = df[df[3] != "Not Available"].copy()
    print("Valid data shape: %s, %s" % df_valid.shape)
    print("Valid data per label:\n%s" % df_valid[2].value_counts())
In [ ]: