In [2]:
%pylab


Using matplotlib backend: TkAgg
Populating the interactive namespace from numpy and matplotlib

In [3]:
import matplotlib.pyplot as plt

In [4]:
import pandas as pd

In [5]:
from sentiment_demo import regularize


['conservative', 'labour', 'libdem', 'snp', 'ukip']
[0.41583143086816693, 0.4480580568720382, 0.5156653710247345, 0.4822474402730382, 0.48965395526459465]
[-0.42693501683501484, -0.4119061813186813, -0.3948863636363632, -0.43810431965442703, -0.43806816693944417]

In [6]:
import csv

In [7]:
# Open the labelled-tweet CSV and wrap it in a row reader.
# newline='' is what the csv module asks for: it lets the reader itself
# handle line endings, so newlines embedded inside quoted fields are parsed
# as part of the field rather than being translated by the file object.
infile = open('labeled_tweets.csv', encoding='utf8', newline='')
reader = csv.reader(infile)

In [8]:
# Convert the raw CSV rows into the records used for the rest of the analysis.
regular = regularize(reader)
# The result is indexed and tabulated directly in the following cells, so the
# reader is no longer needed; close the underlying file instead of leaking the
# handle for the life of the kernel.
# NOTE(review): assumes regularize() consumes the reader eagerly - confirm.
infile.close()

In [9]:
# Peek at the first record to check its layout before building a DataFrame.
regular[0]


Out[9]:
['conservative', 0.1779]

In [10]:
# Tabulate the records: one row per record, with the two fields of each
# record exposed as the 'party' and 'score' columns.
df = pd.DataFrame(data=regular, columns=['party', 'score'])

In [11]:
# Display the frame; for a frame this long the repr elides the middle rows
# and reports the overall shape at the bottom.
df


Out[11]:
party score
0 conservative 0.1779
1 ukip 0.4588
2 ukip 0.3802
3 labour -0.4389
4 conservative -0.4389
5 conservative 0.0258
6 labour 0.2960
7 conservative 0.2960
8 snp -0.4019
9 labour -0.4019
10 snp 0.2732
11 labour 0.2732
12 conservative 0.2732
13 conservative 0.8126
14 snp 0.6956
15 labour 0.6956
16 snp 0.6996
17 labour 0.4019
18 conservative 0.0258
19 snp 0.2003
20 conservative 0.2003
21 libdem -0.2263
22 conservative -0.2263
23 snp 0.3182
24 labour 0.5267
25 snp 0.0772
26 labour 0.0772
27 snp -0.8225
28 labour -0.8225
29 conservative -0.8225
... ... ...
21894 labour -0.2960
21895 snp 0.3818
21896 ukip 0.0972
21897 snp 0.0516
21898 libdem 0.0516
21899 ukip 0.0516
21900 labour 0.0516
21901 conservative 0.0516
21902 conservative -0.4404
21903 labour 0.8519
21904 libdem 0.8885
21905 snp -0.4404
21906 labour -0.4404
21907 labour 0.4019
21908 conservative 0.4019
21909 ukip 0.8889
21910 ukip 0.0772
21911 ukip 0.3612
21912 labour 0.4019
21913 conservative 0.4019
21914 conservative -0.5574
21915 conservative -0.2960
21916 ukip 0.3182
21917 conservative 0.5574
21918 labour -0.7500
21919 conservative -0.7500
21920 snp -0.2960
21921 labour -0.2960
21922 ukip 0.2732
21923 snp -0.1027

21924 rows × 2 columns


In [11]:
# Group the score column by party; every per-party statistic below is
# computed from this grouped object.
# NOTE(review): the per-party counts reported by describe() sum to 8,314,
# while the frame displayed above has 21,924 rows, and this cell shares the
# execution number of the previous one. The outputs look like they come from
# different kernel states - restart the kernel and run all cells to confirm.
grouped = df['score'].groupby(df['party'])

In [12]:
# Per-party summary statistics: count, mean, std, min, quartiles and max.
grouped.describe()


Out[12]:
party              
conservative  count    2135.000000
              mean        0.064120
              std         0.481144
              min        -0.982100
              25%        -0.340000
              50%         0.128000
              75%         0.473900
              max         0.957100
labour        count    1572.000000
              mean        0.049805
              std         0.477151
              min        -0.927400
              25%        -0.421500
              50%         0.140600
              75%         0.458800
              max         0.928600
libdem        count     503.000000
              mean        0.117412
              std         0.497660
              min        -0.881900
              25%        -0.361200
              50%         0.202300
              75%         0.624900
              max         0.932500
snp           count    1049.000000
              mean        0.076029
              std         0.502835
              min        -0.953800
              25%        -0.416600
              50%         0.153100
              75%         0.510600
              max         0.911700
ukip          count    3055.000000
              mean        0.118565
              std         0.505380
              min        -0.943500
              25%        -0.340000
              50%         0.275500
              75%         0.525500
              max         0.941300
dtype: float64

In [18]:
# Mean absolute deviation of each party's scores about that party's mean.
# Spelled out explicitly because Series.mad()/GroupBy.mad() were deprecated in
# pandas 1.5 and removed in 2.0; this form gives the same statistic and runs
# on both old and current pandas releases.
grouped.apply(lambda scores: (scores - scores.mean()).abs().mean())


Out[18]:
party
conservative    0.414290
labour          0.428085
libdem          0.452359
snp             0.455146
ukip            0.447803
dtype: float64

In [17]:
# Median score for each party.
grouped.median()


Out[17]:
party
conservative    0.1280
labour          0.1406
libdem          0.2023
snp             0.1531
ukip            0.2755
Name: score, dtype: float64

In [21]:
# The 20 highest scores within each party, keyed by party and original row
# label.
grouped.nlargest(20)


Out[21]:
party             
conservative  6148    0.9571
              6243    0.9571
              6310    0.9571
              6402    0.9571
              6752    0.9571
              7744    0.9571
              7776    0.9571
              8076    0.9571
              1708    0.9413
              2342    0.9410
              2716    0.9410
              7091    0.9410
              5525    0.9319
              1811    0.9186
              1513    0.9133
              1203    0.9100
              2758    0.9100
              1004    0.9062
              1488    0.9062
              1672    0.9062
labour        1296    0.9286
              2026    0.9286
              4058    0.9286
              2128    0.8910
              1171    0.8777
              805     0.8674
              939     0.8674
              4254    0.8674
              4158    0.8519
              4536    0.8519
                       ...  
snp           124     0.8814
              374     0.8814
              6849    0.8807
              2220    0.8730
              6768    0.8442
              8132    0.8360
              1929    0.8225
              1630    0.8168
              2240    0.8151
              2372    0.8151
ukip          6400    0.9413
              1766    0.9348
              6301    0.9303
              7317    0.9303
              2582    0.9222
              7327    0.9200
              4541    0.9140
              6538    0.9076
              6982    0.9076
              7089    0.9076
              7494    0.9076
              7554    0.9076
              7694    0.9076
              8141    0.9076
              8146    0.9076
              8183    0.9076
              8188    0.9076
              8276    0.9076
              81      0.9005
              8144    0.9005
dtype: float64

In [22]:
# Create a new, empty figure to hold the plot.
fig = plt.figure()

In [23]:
# Add a single set of axes filling the figure (a 1x1 grid, first slot).
ax = fig.add_subplot(1, 1, 1)

In [24]:
# Plot the frame's numeric data against the row index on the axes prepared
# in the preceding cells. Without ax=..., DataFrame.plot creates a figure of
# its own and the explicitly created `fig`/`ax` are left empty.
df.plot(ax=ax)


Out[24]:
<matplotlib.axes._subplots.AxesSubplot at 0x11020e208>

In [38]:
# One curve per party: that party's scores in ascending order, plotted
# against their rank so the distributions can be compared on shared axes.
for party, scores in grouped:
    ascending = sorted(scores)
    plt.plot(ascending, label=party)
plt.legend(loc='best')


Out[38]:
<matplotlib.legend.Legend at 0x119688b70>

In [42]:
# 0.5 quantile of each party's scores; the values agree with the median()
# output shown earlier.
grouped.quantile(0.5)


Out[42]:
party
conservative    0.1280
labour          0.1406
libdem          0.2023
snp             0.1531
ukip            0.2755
dtype: float64

In [62]:
# Draw a histogram of the score distribution for every party in turn.
for party, scores in grouped:
    scores.plot(kind='hist')

In [65]:
# Collect each party's score Series into a list, in the order the grouped
# object yields them; the party labels themselves are discarded.
l = [scores for _party, scores in grouped]

In [71]:
# Copy the first collected group's scores into a plain list and order them
# ascending.
s = list(l[0])
s.sort()

In [78]:
# Wrap the sorted scores in a Series so pandas' counting helpers can be used.
counts = pd.Series(data=s)

In [83]:
# Histogram of how many times each distinct score occurs in `counts`.
# value_counts() returns a Series indexed by the score values themselves, and
# matplotlib probes its input with x[0]; on that Series x[0] is a label
# lookup, which raised KeyError: 0.0 because no score equals 0.0. Passing the
# underlying NumPy array makes the indexing positional.
plt.hist(counts.value_counts().values, bins=20)

In [ ]: