Cross the project data to create two files, each with one abstract per line: one containing the positively rated abstracts and the other the negatively rated ones.
This is done to mimic the Rotten Tomatoes data, as a test of whether the results carry over when the "reviews" are as long as a project abstract.
Create both a set with only the highly positive/negative abstracts and another with all positive/negative ones, to see how this influences the results.
In [26]:
# Read only the project IDs and the IEG overall rating from the ratings CSV.
import csv

ID_COLUMN = 0       # column holding the project ID
RATING_COLUMN = 22  # column holding the IEG overall rating

with open('./ieg-ratings.csv', mode='r') as infile:
    reader = csv.reader(infile)
    next(reader, None)  # skip the header row
    # Map project ID -> rating; if an ID repeats, its last row wins.
    ratings = {row[ID_COLUMN]: row[RATING_COLUMN] for row in reader}

# A parenthesised, single-argument print produces the same output on
# Python 2 and Python 3 (the bare print statement only parses on Python 2).
print("%d projects in total." % len(ratings))
In [38]:
import numpy as np
from collections import Counter

# Tally every rating label in a single pass over the data instead of
# re-scanning all values once per distinct label.
rating_counts = Counter(ratings.values())

# Show the distinct rating labels (sorted), then one "label count" line each.
print(np.unique(list(rating_counts)))
for rating in sorted(rating_counts):
    # Print the plain count; wrapping a scalar in np.unique only turned it
    # into a one-element array (e.g. "[6947]").
    print("%s %d" % (rating, rating_counts[rating]))
There are ~2.6 times as many positive ratings as negative ones, whether or not the moderate ones are included (6947/2666 for all ratings vs. 4975/1872 without the moderates).
In [59]:
# Load the project abstracts from the JSON dump.
# The later cells index the result by project ID and treat each value as text.
import json
from pprint import pprint

with open('./appraisal-abstracts.json') as abstracts_file:
    abstracts = json.load(abstracts_file)
In [79]:
# Split the abstracts into three single-column CSV files (one abstract per
# line) according to the rating of the project they belong to.
pf = 'positive_ratings.csv'
nf = 'negative_ratings.csv'
of = 'other.csv'

# Rating label -> output file. Moderate and plain ratings are grouped with
# the "highly" ones; everything without a usable rating goes to `of`.
outfile_map = {
    'Highly Satisfactory': pf,
    'Highly Unsatisfactory': nf,
    'Moderately Satisfactory': pf,
    'Moderately Unsatisfactory': nf,
    'Not Applicable': of,
    'Not Available': of,
    'Not Rated': of,
    'Satisfactory': pf,
    'Unsatisfactory': nf,
}

with open(pf, 'w') as p, open(nf, 'w') as n, open(of, 'w') as o:
    pwriter = csv.writer(p, delimiter=',')
    nwriter = csv.writer(n, delimiter=',')
    owriter = csv.writer(o, delimiter=',')
    # Output path -> writer, so each abstract is routed with one lookup.
    writer_for = {pf: pwriter, nf: nwriter, of: owriter}

    for p_id, abstract in abstracts.items():
        # Replace commas BEFORE collapsing whitespace: doing it afterwards
        # turns "a, b" into "a  b" and re-introduces double spaces.
        text = ' '.join(abstract.replace(',', ' ').split())
        # A project without a rating, or with an unknown rating label,
        # raises KeyError here rather than being silently dropped.
        writer_for[outfile_map[ratings[p_id]]].writerow([text])
In [ ]:
In [ ]:
In [ ]: