In [2]:
import numpy as np
import pandas as pd

In [3]:
# read data from stage 1 (parser)
df = pd.read_csv('2015_grants.csv',  index_col=0)
df_16 = pd.read_csv('2016_grants.csv', index_col=0)

In [4]:
# Debug Information
#print(df.shape)
#print(df_16.shape)

In [5]:
# remove all "Kleinempfänger" (small recipients) rows
little_15 = df.loc[df.name == 'Kleinempfänger']
little_16 = df_16.loc[df_16.name == 'Kleinempfänger']
df = df.loc[df.name != 'Kleinempfänger']
df_16 = df_16.loc[df_16.name != 'Kleinempfänger']

In [6]:
# Debug Information
#print(df.shape)
#print(df_16.shape)
#print(little_15.shape)
#print(little_16.shape)

In [7]:
# Remove all "strange duplicates", i.e. persons with more than one entry.
# keep=False drops every row whose (name, plz, place) combination occurs more than once.
df = df.drop_duplicates(["name", "plz", "place"], keep=False)
df_16 = df_16.drop_duplicates(["name", "plz", "place"], keep=False)
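
To inspect which rows the keep=False drop actually removes, the duplicated mask can be printed first; a minimal sketch (to be run before the drop_duplicates calls above, otherwise the frame is already clean):

dupes_15 = df[df.duplicated(["name", "plz", "place"], keep=False)]
print(dupes_15.shape)  # number of 2015 rows sharing a (name, plz, place) combination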

In [8]:
# Debug information: total grants in million € (amounts are stored in cents, hence the division by 10**8)
Total = df['Gesamt'].sum()
Total /= (10**8)
print(Total)
Total_16 = df_16['Gesamt'].sum()
Total_16 /= (10**8)
print(Total_16)


6452.10659279
5737.57592941

In [9]:
# Reorganize columns: move the identifying columns and 'Gesamt' to the front
cols = list(df.columns.values)    # make a list of all columns in the df
cols.pop(cols.index('pid'))       # remove 'pid' from the list
cols.pop(cols.index('name'))      # remove 'name' from the list
cols.pop(cols.index('plz'))       # remove 'plz' from the list
cols.pop(cols.index('place'))     # remove 'place' from the list
cols.pop(cols.index('Gesamt'))    # remove 'Gesamt' from the list
df = df[['pid','name', 'plz', 'place', 'Gesamt']+cols]  # new dataframe with the desired column order

cols = list(df_16.columns.values) # same for 2016
cols.pop(cols.index('pid'))
cols.pop(cols.index('name'))
cols.pop(cols.index('plz'))
cols.pop(cols.index('place'))
cols.pop(cols.index('Gesamt'))
df_16 = df_16[['pid','name', 'plz', 'place', 'Gesamt']+cols]
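
The two blocks above perform the same reordering twice; a minimal equivalent sketch, assuming the same column names, moves the key columns to the front without the repeated pop calls:

front = ['pid', 'name', 'plz', 'place', 'Gesamt']
def reorder(frame):
    # key columns first, then every remaining column in its original order
    return frame[front + [c for c in frame.columns if c not in front]]
# df = reorder(df); df_16 = reorder(df_16)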

In [10]:
# Convert cents to €. TODO: fix the parser to store floats (€) by default
df.iloc[:, 4:59] = df.iloc[:, 4:59] / 100
df_16.iloc[:, 4:64] = df_16.iloc[:, 4:64] / 100
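
The hard-coded 4:59 and 4:64 ranges depend on the exact number of measure columns per year. A name-based sketch (assuming every column from 'Gesamt' onwards holds cent amounts) avoids that; shown here as an alternative, not to be run on top of the conversion above:

money_cols = df.columns[df.columns.get_loc('Gesamt'):]
# df[money_cols] = df[money_cols] / 100
money_cols_16 = df_16.columns[df_16.columns.get_loc('Gesamt'):]
# df_16[money_cols_16] = df_16[money_cols_16] / 100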

In [11]:
# Fill all NaN with zeros
df.fillna(0, inplace=True)
df_16.fillna(0, inplace=True)

In [12]:
# Create the person dataset (pid, name, plz, place)
# and sort it deterministically by plz, name and pid
p1 = df.loc[:, ["pid", "name", "plz", "place"]]
p1.sort_values(["plz", "name", "pid"], inplace=True)
p1.reset_index(drop=True, inplace=True)

In [13]:
# Generate BauerID for 2015 (sequential IDs starting at 15000000)
p1.insert(0, 'ID', range(15000000, 15000000 + len(p1)))

In [14]:
# Almost the same for the 2016 entries; new 2016 rows get their BauerID only after the merge
p1_16 = df_16.loc[:, ["pid", "name", "plz", "place"]]
p1_16.sort_values(["plz", "name", "pid"], inplace=True)

In [15]:
# Outer join of both years on (name, plz, place)
r1 = pd.merge(p1, p1_16, how='outer', on=['name', 'plz', 'place'])
r1.rename(columns={'pid_x': '2015', 'pid_y': '2016'}, inplace=True)
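
To see how many persons appear only in 2015, only in 2016, or in both, the same merge can be repeated with indicator=True; a small diagnostic sketch:

check = pd.merge(p1, p1_16, how='outer', on=['name', 'plz', 'place'], indicator=True)
print(check['_merge'].value_counts())  # left_only = 2015 only, right_only = 2016 only, both = matched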

In [16]:
# Check that the merged dataset is duplicate-free (the result should be empty)
r1[r1.duplicated(["name", "plz", "place"], keep=False)]


Out[16]:
ID 2015 name plz place 2016

In [17]:
# Generate BauerIDs for the 2016-only entries that have no ID yet
ids = pd.Series(range(16000000 + len(r1), 16000000, -1))
# fillna with a Series aligns on the index, so every NaN ID (a person new in 2016)
# receives its own ID from the 16xxxxxx range
r1["ID"].fillna(value=ids, inplace=True)

In [18]:
print(r1.loc[r1.plz == 14482])


                ID                                  2015  \
10445   15010445.0  522caf0c-b1b9-4bfc-9454-51283d7ad85f   
10446   15010446.0  6da99620-ae5f-411a-b439-f71c5ede978f   
10447   15010447.0  71f6c81a-7583-45b6-b508-fad668946513   
10448   15010448.0  2c82e1e0-4123-4ce6-ba20-6022ae6e84e9   
268705  16029136.0                                   NaN   

                                          name    plz                 place  \
10445                            Freydank, Uwe  14482      Liebenwalde, Sta   
10446                        Holdt Hans-Jürgen  14482              Weitendo   
10447   Stiftung Naturlandschaften Brandenburg  14482          Potsdam, Sta   
10448                      von Bredow, Mathias  14482       Stechow-Ferches   
268705                  Erbengemeinschaft Nenz  14482  Groß Pankow (Prignit   

                                        2016  
10445   797c9d97-d05e-4bcc-92ee-35ea0a550383  
10446                                    NaN  
10447                                    NaN  
10448   078c4b37-bf9e-47a6-b65a-53396d46527c  
268705  d9b5dda8-b296-4ecb-8a3d-801a655c5e31  

In [19]:
# Reorganize columns, cast BauerID to int
r1 = r1[['ID', 'name', 'plz', 'place', '2015', '2016']]
r1["ID"] = r1["ID"].astype(int)
#print(r1.shape)
#r1.dtypes

In [20]:
# Build r2 with plz as a zero-padded 5-digit string (keeps leading zeros) and print its memory usage.
# TODO: refactor: don't save plz as string, use default int instead.
r2 = r1.copy()
r2.rename(columns={'plz': 'plz_i'}, inplace=True)
r2[['plz']] = r2[['plz_i']].astype(str)
r2['plz'] = r2['plz'].str.zfill(5)
mem1 = r2.memory_usage(index=True, deep=True).sum()
mem1 /= (1024 * 1024)
print(mem1)


116.324575424

In [21]:
# Build the BauerID dataset (BauerID, name, plz, place) and print its memory usage for comparison
BauerID = r2.copy()
BauerID.rename(columns={'ID': 'BauerID'}, inplace=True)
BauerID = BauerID.loc[:, ["BauerID", "name", "plz", "place"]]
mem2 = BauerID.memory_usage(index=True, deep=True).sum()
mem2 /= (1024 * 1024)
print(mem2)


65.6702613831

In [22]:
# Check that the BauerID dataset is duplicate-free (the result should be empty)
BauerID[BauerID.duplicated(["name", "plz", "place"], keep=False)]


Out[22]:
BauerID name plz place

In [23]:
# Save BauerID
BauerID.to_csv('BauerID.csv', encoding='utf-8', index=False)

In [24]:
# Generate the IDLT (ID lookup table) that maps each BauerID to the original pids from 2015 and 2016
IDLT = r2.loc[:, ["ID", "2015", "2016"]] # ID Lookup Table
IDLT.rename(columns={'ID': 'BauerID'}, inplace=True)
#print(IDLT)

In [25]:
# Save IDLT
IDLT.to_csv('IDLT.csv', encoding='utf-8', index=False)

In [26]:
# Generate the final grant dataset
# Scheme: BauerID, Gesamt, followed by one column per measure
grant_15 = pd.merge(IDLT, df, how='inner', left_on = '2015', right_on = 'pid')

In [27]:
# Clean up the final grant dataset: keep BauerID plus the Gesamt and measure columns,
# dropping the merge helper columns 2015, 2016, pid, name, plz and place (positions 1 to 6)
a = list(range(0, 61))
for i in [1, 2, 3, 4, 5, 6]:
  a.remove(i)
grant_15 = grant_15.iloc[:, a]

In [28]:
# Same for 2016
grant_16 = pd.merge(IDLT, df_16, how='inner', left_on = '2016', right_on = 'pid')
b = list(range(0, 64))
for i in [1, 2, 3, 4, 5, 6]:
  b.remove(i)
grant_16 = grant_16.iloc[:, b]
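
The positional index lists above depend on the merged column order. A name-based alternative sketch (assuming the column names used earlier) drops the helper columns instead; shown only as an alternative to the iloc selection:

helper_cols = ['2015', '2016', 'pid', 'name', 'plz', 'place']
# grant_15 = grant_15.drop(helper_cols, axis=1)
# grant_16 = grant_16.drop(helper_cols, axis=1)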

In [29]:
# Debug Information
#t2_15 = df['Gesamt'].sum()/(10**6)
#t2_16 = df_16['Gesamt'].sum()/(10**6)
#print(t2_15, t2_16)

In [30]:
# Debug Information
#t2_15 = grant_15['Gesamt'].sum()/(10**6)
#t2_16 = grant_16['Gesamt'].sum()/(10**6)
#print(t2_15, t2_16)

In [31]:
grant_16.to_csv('grant_16.csv', encoding='utf-8', index=False)
grant_15.to_csv('grant_15.csv', encoding='utf-8', index=False)