In [1]:
import pandas as pd
import json
import fnmatch
import os

In [2]:
# Paths used throughout the notebook: the JSON file with the downloaded
# tweets, the directory tree holding the label files, and the output folder.
TWEET_DATA_FILE = "TWEET_DATA.json"
LABELS_BASE_DIR = "data/2download"
OUTPUT_DIR = "data/processed"

# Parallel lists: FILENAMES[i] is the bare file name of INPUT_FILES[i].
FILENAMES = []
INPUT_FILES = []
# Walk the whole label tree and keep every "*subtask*.txt" file found.
for root, dirnames, filenames in os.walk(LABELS_BASE_DIR):
    for filename in fnmatch.filter(filenames, '*subtask*.txt'):
        FILENAMES.append(filename)
        INPUT_FILES.append(os.path.join(root, filename))

# Single-argument print(...) produces the same output on Python 2 and 3.
print(INPUT_FILES)
print(FILENAMES)


['data/2download/gold/devtest/100_topics_XXX_tweets.topic-two-point.subtask-BD.devtest.gold.txt', 'data/2download/gold/devtest/100_topics_100_tweets.topic-five-point.subtask-CE.devtest.gold.txt', 'data/2download/gold/devtest/100_topics_100_tweets.sentence-three-point.subtask-A.devtest.gold.txt', 'data/2download/gold/train/100_topics_100_tweets.topic-five-point.subtask-CE.train.gold.txt', 'data/2download/gold/train/100_topics_XXX_tweets.topic-two-point.subtask-BD.train.gold.txt', 'data/2download/gold/train/100_topics_100_tweets.sentence-three-point.subtask-A.train.gold.txt', 'data/2download/gold/dev/100_topics_100_tweets.sentence-three-point.subtask-A.dev.gold.txt', 'data/2download/gold/dev/100_topics_100_tweets.topic-five-point.subtask-CE.dev.gold.txt', 'data/2download/gold/dev/100_topics_XXX_tweets.topic-two-point.subtask-BD.dev.gold.txt', 'data/2download/input/devtest/100_topics_100_tweets.topic-five-point.subtask-CE.devtest.input.txt', 'data/2download/input/devtest/100_topics_100_tweets.sentence-three-point.subtask-A.devtest.input.txt', 'data/2download/input/devtest/100_topics_XXX_tweets.topic-two-point.subtask-BD.devtest.input.txt']
['100_topics_XXX_tweets.topic-two-point.subtask-BD.devtest.gold.txt', '100_topics_100_tweets.topic-five-point.subtask-CE.devtest.gold.txt', '100_topics_100_tweets.sentence-three-point.subtask-A.devtest.gold.txt', '100_topics_100_tweets.topic-five-point.subtask-CE.train.gold.txt', '100_topics_XXX_tweets.topic-two-point.subtask-BD.train.gold.txt', '100_topics_100_tweets.sentence-three-point.subtask-A.train.gold.txt', '100_topics_100_tweets.sentence-three-point.subtask-A.dev.gold.txt', '100_topics_100_tweets.topic-five-point.subtask-CE.dev.gold.txt', '100_topics_XXX_tweets.topic-two-point.subtask-BD.dev.gold.txt', '100_topics_100_tweets.topic-five-point.subtask-CE.devtest.input.txt', '100_topics_100_tweets.sentence-three-point.subtask-A.devtest.input.txt', '100_topics_XXX_tweets.topic-two-point.subtask-BD.devtest.input.txt']

In [3]:
# Load the tweet lookup; the context manager closes the file handle as soon
# as parsing is done instead of leaving it open.
with open(TWEET_DATA_FILE) as tweet_file:
    tweet_data = json.load(tweet_file)
# Number of top-level entries in the loaded JSON.
print(len(tweet_data))


9951

In [4]:
# Read the first discovered label file: tab-separated, no header row.
df = pd.read_csv(INPUT_FILES[0], header=None, sep="\t")

In [5]:
# Label the first column "tid"; every other column keeps its existing label.
cols = ["tid"] + df.columns.tolist()[1:]
df.columns = cols
df.head()


Out[5]:
tid 1 2
0 637641175948763136 sony positive
1 637666734300905472 sony negative
2 637668142110654468 sony positive
3 637708370129125377 sony positive
4 637807521500020737 sony negative

In [6]:
def add_tweet(x):
    """Return the text of tweet ``x`` flattened onto a single line.

    ``x`` is formatted as a string and looked up in the module-level
    ``tweet_data`` mapping. Ids missing from the mapping yield the
    placeholder "Not Available". Every newline and carriage return in the
    text is replaced by one space.

    If reading the entry fails (for example its "text" value is not a
    string), the offending id is printed and the error is re-raised.
    """
    x = "%s" % x
    try:
        text = tweet_data.get(x, {"text": "Not Available"})["text"]
        return text.replace("\n", " ").replace("\r", " ")
    except Exception:
        # Show which id failed before propagating; KeyboardInterrupt and
        # SystemExit are deliberately not intercepted.
        print(x)
        raise

In [7]:
# Look up the text for every tweet id and store it in a new "text" column.
df["text"] = df["tid"].map(add_tweet)
df.head()


Out[7]:
tid 1 2 text
0 637641175948763136 sony positive Not Available
1 637666734300905472 sony negative Not Available
2 637668142110654468 sony positive @fakethom Have android tab and don't use phone...
3 637708370129125377 sony positive Finally I get my ps4 back I sent it to Sony ca...
4 637807521500020737 sony negative Not Available

In [8]:
# Make sure the output folder exists; to_csv does not create missing directories.
if not os.path.isdir(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)
# Write the augmented first file as tab-separated text, without header row or index.
df.to_csv("%s/%s" % (OUTPUT_DIR, FILENAMES[0]), sep="\t", header=False, index=False)

In [23]:
def append_tweets(input_file, output_file):
    """Copy a tab-separated label file, appending the tweet text as a last column.

    Parameters
    ----------
    input_file : str
        Path of a tab-separated file without a header row whose first column
        holds the tweet id.
    output_file : str
        Path of the tab-separated file to write (no header row, no index).
        Its parent directory is created when it does not exist yet.

    The text for each id is obtained from ``add_tweet``. The shape of the
    written frame is printed; nothing is returned.
    """
    df = pd.read_csv(input_file, sep="\t", header=None)
    # Name the id column so it can be addressed as df["tid"].
    cols = df.columns.tolist()
    cols[0] = "tid"
    df.columns = cols
    df["text"] = df["tid"].apply(add_tweet)
    # to_csv does not create missing directories, so do it here.
    output_dir = os.path.dirname(output_file)
    if output_dir and not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    df.to_csv(output_file, sep="\t", header=False, index=False)
    # Same text as the Python 2 statement `print "...: ", df.shape`
    # (two spaces before the tuple), but valid on Python 3 as well.
    print("Wrote dataframe with shape:  %s" % (df.shape,))

In [10]:
# Run the helper on the first discovered file only.
append_tweets(
    INPUT_FILES[0],
    "%s/%s" % (OUTPUT_DIR, FILENAMES[0]),
)

In [24]:
# Process every discovered label file, writing each result into OUTPUT_DIR
# under the same file name as its input.
for input_file, k in zip(INPUT_FILES, FILENAMES):
    output_file = "%s/%s" % (OUTPUT_DIR, k)
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("Processing %s, saving to %s" % (input_file, output_file))
    append_tweets(input_file, output_file)


Processing data/2download/gold/devtest/100_topics_XXX_tweets.topic-two-point.subtask-BD.devtest.gold.txt, saving to data/processed/100_topics_XXX_tweets.topic-two-point.subtask-BD.devtest.gold.txt
Wrote dataframe with shape:  (1417, 4)
Processing data/2download/gold/devtest/100_topics_100_tweets.topic-five-point.subtask-CE.devtest.gold.txt, saving to data/processed/100_topics_100_tweets.topic-five-point.subtask-CE.devtest.gold.txt
Wrote dataframe with shape:  (2000, 4)
Processing data/2download/gold/devtest/100_topics_100_tweets.sentence-three-point.subtask-A.devtest.gold.txt, saving to data/processed/100_topics_100_tweets.sentence-three-point.subtask-A.devtest.gold.txt
Wrote dataframe with shape:  (2000, 3)
Processing data/2download/gold/train/100_topics_100_tweets.topic-five-point.subtask-CE.train.gold.txt, saving to data/processed/100_topics_100_tweets.topic-five-point.subtask-CE.train.gold.txt
Wrote dataframe with shape:  (6000, 4)
Processing data/2download/gold/train/100_topics_XXX_tweets.topic-two-point.subtask-BD.train.gold.txt, saving to data/processed/100_topics_XXX_tweets.topic-two-point.subtask-BD.train.gold.txt
Wrote dataframe with shape:  (4346, 4)
Processing data/2download/gold/train/100_topics_100_tweets.sentence-three-point.subtask-A.train.gold.txt, saving to data/processed/100_topics_100_tweets.sentence-three-point.subtask-A.train.gold.txt
Wrote dataframe with shape:  (6000, 3)
Processing data/2download/gold/dev/100_topics_100_tweets.sentence-three-point.subtask-A.dev.gold.txt, saving to data/processed/100_topics_100_tweets.sentence-three-point.subtask-A.dev.gold.txt
Wrote dataframe with shape:  (2000, 3)
Processing data/2download/gold/dev/100_topics_100_tweets.topic-five-point.subtask-CE.dev.gold.txt, saving to data/processed/100_topics_100_tweets.topic-five-point.subtask-CE.dev.gold.txt
Wrote dataframe with shape:  (2000, 4)
Processing data/2download/gold/dev/100_topics_XXX_tweets.topic-two-point.subtask-BD.dev.gold.txt, saving to data/processed/100_topics_XXX_tweets.topic-two-point.subtask-BD.dev.gold.txt
Wrote dataframe with shape:  (1325, 4)
Processing data/2download/input/devtest/100_topics_100_tweets.topic-five-point.subtask-CE.devtest.input.txt, saving to data/processed/100_topics_100_tweets.topic-five-point.subtask-CE.devtest.input.txt
Wrote dataframe with shape:  (2000, 3)
Processing data/2download/input/devtest/100_topics_100_tweets.sentence-three-point.subtask-A.devtest.input.txt, saving to data/processed/100_topics_100_tweets.sentence-three-point.subtask-A.devtest.input.txt
Wrote dataframe with shape:  (2000, 2)
Processing data/2download/input/devtest/100_topics_XXX_tweets.topic-two-point.subtask-BD.devtest.input.txt, saving to data/processed/100_topics_XXX_tweets.topic-two-point.subtask-BD.devtest.input.txt
Wrote dataframe with shape:  (1417, 3)

In [25]:
# Both lists should have one entry per discovered file.
tuple(map(len, (INPUT_FILES, FILENAMES)))


Out[25]:
(12, 12)

In [26]:
# zip() returns a lazy iterator on Python 3, so materialise it before len().
len(list(zip(INPUT_FILES, ["%s/%s" % (OUTPUT_DIR, k) for k in FILENAMES])))


Out[26]:
12

In [27]:
%%bash
# Sort the processed files into gold/{dev,devtest,train} and input/devtest.
# Stop if the directory is missing, so mkdir/mv never run in another folder.
cd data/processed/ || exit 1
mkdir -p gold/{dev,devtest,train} input/devtest
mv *.dev.gold.txt gold/dev
mv *.devtest.gold.txt gold/devtest/
mv *.train.gold.txt gold/train/
mv *.devtest.input.txt input/devtest/
ls


gold
input

In [28]:
%%bash
# List every .txt file under data/processed; stop if the directory is
# missing rather than listing whatever the current directory contains.
cd data/processed/ || exit 1
find ./ -name "*.txt"


./gold/devtest/100_topics_XXX_tweets.topic-two-point.subtask-BD.devtest.gold.txt
./gold/devtest/100_topics_100_tweets.topic-five-point.subtask-CE.devtest.gold.txt
./gold/devtest/100_topics_100_tweets.sentence-three-point.subtask-A.devtest.gold.txt
./gold/train/100_topics_100_tweets.topic-five-point.subtask-CE.train.gold.txt
./gold/train/100_topics_XXX_tweets.topic-two-point.subtask-BD.train.gold.txt
./gold/train/100_topics_100_tweets.sentence-three-point.subtask-A.train.gold.txt
./gold/dev/100_topics_100_tweets.sentence-three-point.subtask-A.dev.gold.txt
./gold/dev/100_topics_100_tweets.topic-five-point.subtask-CE.dev.gold.txt
./gold/dev/100_topics_XXX_tweets.topic-two-point.subtask-BD.dev.gold.txt
./input/devtest/100_topics_100_tweets.topic-five-point.subtask-CE.devtest.input.txt
./input/devtest/100_topics_100_tweets.sentence-three-point.subtask-A.devtest.input.txt
./input/devtest/100_topics_XXX_tweets.topic-two-point.subtask-BD.devtest.input.txt

In [ ]: