In [1]:
%matplotlib nbagg
# NOTE(review): %pylab star-imports numpy and matplotlib.pyplot into the
# interactive namespace (np, plot, imshow, figure, title, ...); every
# later cell relies on this. Prefer explicit imports + %matplotlib in
# new notebooks.
%pylab
In [2]:
import pandas as pd
import csv
fnames = !ls *.csv
docs = []
for fname in fnames:
with open(fname) as f:
for doc in csv.DictReader(f,delimiter=';'):
doc = {k.strip().lower().replace(' ', '_'): v.strip() for k, v in doc.iteritems()}
doc = {k: int(v) if v.isdigit() else v for k, v in doc.iteritems()}
docs.append(doc)
df = pd.DataFrame(docs)
In [3]:
from collections import defaultdict

# Aggregate per-polling-place rows into one vector per circuit: group by
# (province, department, circuit) and sum the votes of each vote code
# into its own column.
fields = 'codigo_provincia codigo_departamento codigo_circuito'.split()
vectors = []
for group_id, locs in df.groupby(by=fields).groups.iteritems():
    # One accumulator per circuit; vote codes never seen default to 0.
    vector = defaultdict(int)
    # Keep the grouping keys alongside the aggregated counts.
    vector.update(dict(zip(fields, group_id)))
    # NOTE(review): .groups maps keys to index *labels*, while .iloc is
    # positional — this only lines up because df was built fresh with a
    # default RangeIndex; confirm before reusing on a filtered frame.
    for _, row in df.iloc[locs].iterrows():
        vector[str(row.codigo_votos)] += row.votos
    vectors.append(vector)
gdf = pd.DataFrame(vectors)
# Checkpoint the aggregate so later sessions can skip the rebuild.
# (The file is HDF5 despite the '.h5py' extension.)
gdf.to_hdf('gdf.h5py', 'gdf')
In [ ]:
# Checkpoint reload: restores gdf from disk so the CSV ingestion and
# aggregation cells above can be skipped in a fresh session.
import pandas as pd
gdf = pd.read_hdf('gdf.h5py', 'gdf')
In [4]:
# Per-circuit derived columns: the 131-vs-135 gap, the add-one (Laplace)
# smoothed total, and a smoothed share column per vote code. The former
# magic constant 6 was the party count; it is now derived from the list
# so the two cannot drift apart.
parties = '132 133 137 138 135 131'.split()
gdf['diff'] = gdf['135'] - gdf['131']
# +1 pseudo-count per party keeps every share strictly positive.
gdf['total'] = sum(gdf[k] for k in parties) + len(parties)
for k in parties:
    gdf[k + "_pct"] = (gdf[k] + 1.0) / gdf.total
# Keep only circuits where both main codes received at least one vote.
gdf = gdf[gdf['135'] > 0]
gdf = gdf[gdf['131'] > 0]
In [8]:
from scipy.stats import gaussian_kde


class ConditionalDistribution(object):
    """Gaussian-KDE estimate of the conditional distribution x1 | x2.

    Fits a joint KDE over (x1, x2) plus a marginal KDE over x2 and
    evaluates their ratio on a fixed grid. Relies on the %pylab
    namespace (np, imshow, xticks, figure, subplot, plot, ...).
    """

    def __init__(self, x1, x2, predict_resolution=100):
        """
        fits x1 | x2
        """
        # x1 -- sample of the dependent variable.
        # x2 -- sample of the conditioning variable.
        self.x1 = x1
        self.x2 = x2
        self.predict_resolution = predict_resolution
        # Evaluation grid clipped to x2's 1st-99th percentiles so
        # outliers do not stretch it.
        # NOTE(review): the grid predict() returns as candidate x1 values
        # is built from x2's range — this only looks sound because both
        # variables are vote shares on the same scale; TODO confirm.
        ymin, ymax = np.percentile(self.x2, [1, 99])
        self.y = np.linspace(ymin, ymax, predict_resolution)
        self._cache = {}

    def fit(self):
        # Joint density over (x1, x2) and marginal density over x2.
        self.joint_estimate = gaussian_kde(np.vstack([self.x1, self.x2]))
        self.cond_estimate = gaussian_kde(self.x2)
        return self

    def predict(self, x):
        # Returns (grid, conditional density values along the grid).
        # NOTE(review): the joint KDE is evaluated at points (x1=x, x2=e)
        # while the divisor is the x2-marginal evaluated at x — the
        # coordinates look swapped relative to the "fits x1 | x2"
        # docstring; TODO confirm (it works in practice because x1 and
        # x2 share a common scale).
        return self.y, self.joint_estimate(np.asarray([(x, e) for e in self.y]).T) / self.cond_estimate(x)

    def sample(self, x):
        # Inverse-CDF draw over the discretized conditional density.
        # NOTE(review): the draw is cached per x, so repeated calls with
        # the same x return the identical value — presumably a speed
        # hack; confirm that reusing one draw across rows is acceptable.
        if x not in self._cache:
            y, probs = self.predict(x)
            probs = np.cumsum(probs)
            p = np.random.random() * probs[-1]
            self._cache[x] = y[probs.searchsorted(p)]
        return self._cache[x]

    def draw(self, resolution=100j):
        # Diagnostic figure: joint density, conditional density, and the
        # x2 marginal, on a 1st-99th percentile bounding box.
        xmin, xmax = np.percentile(self.x1, [1, 99])
        ymin, ymax = np.percentile(self.x2, [1, 99])
        # Imaginary step makes mgrid emit `resolution` points per axis.
        X, Y = np.mgrid[xmin:xmax:resolution, ymin:ymax:resolution]
        positions = np.vstack([X.ravel(), Y.ravel()])

        def draw_Z(Z):
            # Render one density grid with 5 labelled ticks per axis.
            imshow(Z, interpolation='nearest', origin='lower')
            locs = np.arange(0, int(resolution.imag), int(resolution.imag) / 5)
            xticks(locs, ['%.02f' % e for e in X[locs, 0].squeeze()])
            yticks(locs, ['%.02f' % e for e in Y[0, locs].squeeze()])

        figure()
        subplot(311)
        Z = np.reshape(self.joint_estimate(positions), X.shape).T
        draw_Z(Z)
        subplot(312)
        # NOTE(review): cond_estimate(Y[0]) broadcasts along Z's last
        # axis, which after the .T above is the x1 axis — verify the
        # intended normalization axis.
        draw_Z(Z / self.cond_estimate(Y[0]))
        subplot(313)
        plot(self.cond_estimate(Y[0]))
In [6]:
from collections import defaultdict
class Model(object):
def fit(self, dfX, dfy):
self.distrs = {}
for predictor, x_values in dfX.iteritems():
self.distrs[predictor] = {}
for target, y_values in dfy.iteritems():
self.distrs[predictor][target] = ConditionalDistribution(y_values, x_values).fit()
return self
def predict(self, dfX, df_cnt):
res = defaultdict(int)
for row_id, row in dfX.iterrows():
if row_id % 100 == 0:
print row_id, 'of', len(dfX)
print dict(res)
s = sum(res.values())
print {k: v/s for k, v in res.iteritems()}
for predictor, x_value in row.iteritems():
targets = {}
for target, distr in self.distrs[predictor].iteritems():
y_value = distr.sample(x_value)
targets[target] = y_value
s = sum(targets.values())
for target, value in targets.iteritems():
pred = df_cnt.iloc[row_id][predictor.replace('_pct', '')] * value / s
if pd.isnull(pred): 1/0
res[target] += pred
return dict(res)
In [9]:
# Visual sanity check: how the estimated conditional 131_pct | 135_pct
# changes with the fraction of circuits used to fit it.
for t in [0.01, 0.1, 0.5, 1]:
    # Random subsample containing roughly a fraction t of the rows.
    mask = np.random.random_sample(len(gdf)) < t
    cd = ConditionalDistribution(gdf['131_pct'][mask], gdf['135_pct'][mask]).fit()
    cd.draw(100j)
    # NOTE(review): title() targets the current axes, i.e. the last
    # subplot drawn by draw(), not the whole figure — confirm intended.
    title(str(t))
In [10]:
# Training inputs: predictors are the minor codes' smoothed shares,
# targets the two main codes' shares; df_cnt keeps the raw counts that
# predict() will redistribute.
dfX = gdf['132_pct 133_pct 137_pct 138_pct'.split()]
df_cnt = gdf['132 133 137 138'.split()]
dfy = gdf['131_pct 135_pct'.split()]
# NOTE(review): random_sample() < 10 is always True, so this mask
# selects every row — presumably a leftover from subsampling
# experiments (cf. the thresholds tried in the previous cell).
mask = np.random.random_sample(len(gdf)) < 10
model = Model().fit(dfX[mask], dfy[mask])
In [11]:
d = model.predict(dfX, df_cnt)
In [ ]:
debug
In [ ]:
# NOTE(review): stale cell — the Model class defined above has no
# .iteritems(), so this raises AttributeError; it looks written for an
# earlier, dict-based incarnation of `model`. Delete or update.
for predictor, d in model.iteritems():
    figure()
    d.plot()
In [ ]:
from collections import defaultdict

# NOTE(review): another stale cell — it indexes `model` as a nested
# dict {predictor: {binned share: log-odds}}, which does not match the
# Model class defined above; only keep if the dict-based model is
# restored. TODO confirm before running.
ans = defaultdict(int)
discarded = 0
for _, row in gdf.iterrows():
    for predictor, f in row.iteritems():
        if predictor not in model: continue
        # Snap the share to a 1/40 grid to use it as a lookup key.
        f = int(f * 40) / 40.0
        # Raw vote count behind this share column.
        cnt = row[predictor.replace('_pct', '')]
        if f not in model[predictor]:
            # No estimate for this bin: drop the votes but tally them.
            discarded += cnt
            continue
        # Logistic transform of the stored value; presumably p is the
        # probability that a vote goes to 131 — TODO confirm the sign.
        p = 1 / (np.exp(model[predictor][f]) + 1)
        assert p <= 1
        ans['131'] += p * cnt
        ans['135'] += (1 - p) * cnt
        # ans[target] += target_model[cnt] * cnt
In [ ]:
# Add the directly observed 131/135 votes on top of the extrapolation.
ans['131'] += gdf['131'].sum()
ans['135'] += gdf['135'].sum()
In [ ]:
# Final result: normalize the extrapolated totals into overall shares.
denominator = sum(ans.values())
{code: votes / denominator for code, votes in ans.iteritems()}
In [ ]:
ans['135']
In [ ]:
# Cross-check: total ballots across all recorded vote codes.
s = sum(gdf[k].sum() for k in [u'131', u'132', u'133', u'135', u'137', u'138'])
s
In [ ]:
# NOTE(review): rebuilds the same predictor frame as dfX above, and
# shadows the `fields` list used for the groupby earlier — consider
# reusing dfX instead.
fields = [u'132_pct', u'133_pct', u'137_pct', u'138_pct']
X = gdf[fields]
In [ ]:
# Distribution of the smoothed per-circuit ballot totals.
figure()
gdf.total.hist(bins=30)
In [ ]:
# 2-D embedding of circuits by their minor-code shares.
# NOTE(review): .fit() stores the result on tsne.embedding_, but it is
# never read or plotted here — presumably the analysis stops short.
from sklearn.manifold import TSNE
tsne = TSNE().fit(X)