In [2]:
import numpy as np
import pandas as pd

In [3]:
# read data from stage 1 (parser)
df = pd.read_csv('2015_grants.csv',  index_col=0)
df_16 = pd.read_csv('2016_grants.csv', index_col=0)

In [4]:
# Debug Information
#print(df.shape)
#print(df_16.shape)

In [5]:
# remove all "Kleinempfänger" (small recipients) rows
little_15 = df.loc[df.name == 'Kleinempfänger']
little_16 = df_16.loc[df_16.name == 'Kleinempfänger']
df = df.loc[df.name != 'Kleinempfänger']
df_16 = df_16.loc[df_16.name != 'Kleinempfänger']

In [6]:
# Debug Information
#print(df.shape)
#print(df_16.shape)
#print(little_15.shape)
#print(little_16.shape)

In [7]:
# Remove all "strange duplicates", i.e. persons with more than one entry.
# keep=False drops every row whose (name, plz, place) combination occurs more than once.
df = df.drop_duplicates(["name", "plz", "place"], keep=False)
df_16 = df_16.drop_duplicates(["name", "plz", "place"], keep=False)
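
To inspect which rows the keep=False drop actually removes, the duplicated mask can be printed first; a minimal sketch (to be run before the drop_duplicates calls above, otherwise the frame is already clean):

dupes_15 = df[df.duplicated(["name", "plz", "place"], keep=False)]
print(dupes_15.shape)  # number of 2015 rows sharing a (name, plz, place) combination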

In [8]:
# Debug information: total grants in million € (amounts are stored in cents, hence the division by 10**8)
Total = df['Gesamt'].sum()
Total /= (10**8)
print(Total)
Total_16 = df_16['Gesamt'].sum()
Total_16 /= (10**8)
print(Total_16)


6452.10659279
5737.57592941

In [9]:
# Reorganize columns: move the identifying columns and 'Gesamt' to the front
cols = list(df.columns.values)    # make a list of all columns in the df
cols.pop(cols.index('pid'))       # remove 'pid' from the list
cols.pop(cols.index('name'))      # remove 'name' from the list
cols.pop(cols.index('plz'))       # remove 'plz' from the list
cols.pop(cols.index('place'))     # remove 'place' from the list
cols.pop(cols.index('Gesamt'))    # remove 'Gesamt' from the list
df = df[['pid','name', 'plz', 'place', 'Gesamt']+cols]  # new dataframe with the desired column order

cols = list(df_16.columns.values) # same for 2016
cols.pop(cols.index('pid'))
cols.pop(cols.index('name'))
cols.pop(cols.index('plz'))
cols.pop(cols.index('place'))
cols.pop(cols.index('Gesamt'))
df_16 = df_16[['pid','name', 'plz', 'place', 'Gesamt']+cols]
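
The two blocks above perform the same reordering twice; a minimal equivalent sketch, assuming the same column names, moves the key columns to the front without the repeated pop calls:

front = ['pid', 'name', 'plz', 'place', 'Gesamt']
def reorder(frame):
    # key columns first, then every remaining column in its original order
    return frame[front + [c for c in frame.columns if c not in front]]
# df = reorder(df); df_16 = reorder(df_16)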

In [10]:
# Convert cents to €. TODO: fix the parser to store floats (€) by default
df.iloc[:, 4:59] = df.iloc[:, 4:59] / 100
df_16.iloc[:, 4:64] = df_16.iloc[:, 4:64] / 100
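
The hard-coded 4:59 and 4:64 ranges depend on the exact number of measure columns per year. A name-based sketch (assuming every column from 'Gesamt' onwards holds cent amounts) avoids that; shown here as an alternative, not to be run on top of the conversion above:

money_cols = df.columns[df.columns.get_loc('Gesamt'):]
# df[money_cols] = df[money_cols] / 100
money_cols_16 = df_16.columns[df_16.columns.get_loc('Gesamt'):]
# df_16[money_cols_16] = df_16[money_cols_16] / 100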

In [11]:
# Fill all NaN with zeros
df.fillna(0, inplace=True)
df_16.fillna(0, inplace=True)

In [12]:
# Create the person dataset (pid, name, plz, place)
# and sort it deterministically by plz, name and pid
p1 = df.loc[:, ["pid", "name", "plz", "place"]]
p1.sort_values(["plz", "name", "pid"], inplace=True)
p1.reset_index(drop=True, inplace=True)

In [13]:
# Generate BauerID for 2015 (sequential IDs starting at 15000000)
p1.insert(0, 'ID', range(15000000, 15000000 + len(p1)))

In [14]:
# Almost the same for the 2016 entries; new 2016 rows get their BauerID only after the merge
p1_16 = df_16.loc[:, ["pid", "name", "plz", "place"]]
p1_16.sort_values(["plz", "name", "pid"], inplace=True)

In [15]:
# Outer join of both years on (name, plz, place)
r1 = pd.merge(p1, p1_16, how='outer', on=['name', 'plz', 'place'])
r1.rename(columns={'pid_x': '2015', 'pid_y': '2016'}, inplace=True)
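
To see how many persons appear only in 2015, only in 2016, or in both, the same merge can be repeated with indicator=True; a small diagnostic sketch:

check = pd.merge(p1, p1_16, how='outer', on=['name', 'plz', 'place'], indicator=True)
print(check['_merge'].value_counts())  # left_only = 2015 only, right_only = 2016 only, both = matched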

In [16]:
# Check that the merged dataset is duplicate-free (the result should be empty)
r1[r1.duplicated(["name", "plz", "place"], keep=False)]


Out[16]:
ID 2015 name plz place 2016

In [17]:
# Generate BauerIDs for the 2016-only entries that have no ID yet
ids = pd.Series(range(16000000 + len(r1), 16000000, -1))
# fillna with a Series aligns on the index, so every NaN ID (a person new in 2016)
# receives its own ID from the 16xxxxxx range
r1["ID"].fillna(value=ids, inplace=True)

In [18]:
print(r1.loc[r1.plz == 14482])


                ID                                  2015  \
10445   15010445.0  522caf0c-b1b9-4bfc-9454-51283d7ad85f   
10446   15010446.0  6da99620-ae5f-411a-b439-f71c5ede978f   
10447   15010447.0  71f6c81a-7583-45b6-b508-fad668946513   
10448   15010448.0  2c82e1e0-4123-4ce6-ba20-6022ae6e84e9   
268705  16029136.0                                   NaN   

                                          name    plz                 place  \
10445                            Freydank, Uwe  14482      Liebenwalde, Sta   
10446                        Holdt Hans-Jürgen  14482              Weitendo   
10447   Stiftung Naturlandschaften Brandenburg  14482          Potsdam, Sta   
10448                      von Bredow, Mathias  14482       Stechow-Ferches   
268705                  Erbengemeinschaft Nenz  14482  Groß Pankow (Prignit   

                                        2016  
10445   797c9d97-d05e-4bcc-92ee-35ea0a550383  
10446                                    NaN  
10447                                    NaN  
10448   078c4b37-bf9e-47a6-b65a-53396d46527c  
268705  d9b5dda8-b296-4ecb-8a3d-801a655c5e31  

In [19]:
# Reorganize columns, cast BauerID to int
r1 = r1[['ID', 'name', 'plz', 'place', '2015', '2016']]
r1["ID"] = r1["ID"].astype(int)
#print(r1.shape)
#r1.dtypes

In [20]:
# Build r2 with plz as a zero-padded 5-digit string (keeps leading zeros) and print its memory usage.
# TODO: refactor: don't save plz as string, use default int instead.
r2 = r1.copy()
r2.rename(columns={'plz': 'plz_i'}, inplace=True)
r2[['plz']] = r2[['plz_i']].astype(str)
r2['plz'] = r2['plz'].str.zfill(5)
mem1 = r2.memory_usage(index=True, deep=True).sum()
mem1 /= (1024 * 1024)
print(mem1)


116.324575424

In [21]:
# Build the BauerID dataset (BauerID, name, plz, place) and print its memory usage for comparison
BauerID = r2.copy()
BauerID.rename(columns={'ID': 'BauerID'}, inplace=True)
BauerID = BauerID.loc[:, ["BauerID", "name", "plz", "place"]]
mem2 = BauerID.memory_usage(index=True, deep=True).sum()
mem2 /= (1024 * 1024)
print(mem2)


65.6702613831

In [22]:
# Check that the BauerID dataset is duplicate-free (the result should be empty)
BauerID[BauerID.duplicated(["name", "plz", "place"], keep=False)]


Out[22]:
BauerID name plz place

In [23]:
# Save BauerID
BauerID.to_csv('BauerID.csv', encoding='utf-8', index=False)

In [24]:
# Generate the IDLT (ID lookup table) that maps each BauerID to the original pids from 2015 and 2016
IDLT = r2.loc[:, ["ID", "2015", "2016"]] # ID Lookup Table
IDLT.rename(columns={'ID': 'BauerID'}, inplace=True)
#print(IDLT)

In [25]:
# Save IDLT
IDLT.to_csv('IDLT.csv', encoding='utf-8', index=False)

In [26]:
# Generate the final grant dataset
# Scheme: BauerID, Gesamt, followed by one column per measure
grant_15 = pd.merge(IDLT, df, how='inner', left_on = '2015', right_on = 'pid')

In [27]:
# Clean up the final grant dataset: keep BauerID plus the Gesamt and measure columns,
# dropping the merge helper columns 2015, 2016, pid, name, plz and place (positions 1 to 6)
a = list(range(0, 61))
for i in [1, 2, 3, 4, 5, 6]:
  a.remove(i)
grant_15 = grant_15.iloc[:, a]

In [28]:
# Same for 2016
grant_16 = pd.merge(IDLT, df_16, how='inner', left_on = '2016', right_on = 'pid')
b = list(range(0, 64))
for i in [1, 2, 3, 4, 5, 6]:
  b.remove(i)
grant_16 = grant_16.iloc[:, b]
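
The positional index lists above depend on the merged column order. A name-based alternative sketch (assuming the column names used earlier) drops the helper columns instead; shown only as an alternative to the iloc selection:

helper_cols = ['2015', '2016', 'pid', 'name', 'plz', 'place']
# grant_15 = grant_15.drop(helper_cols, axis=1)
# grant_16 = grant_16.drop(helper_cols, axis=1)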

In [29]:
# Debug Information
#t2_15 = df['Gesamt'].sum()/(10**6)
#t2_16 = df_16['Gesamt'].sum()/(10**6)
#print(t2_15, t2_16)

In [30]:
# Debug Information
#t2_15 = grant_15['Gesamt'].sum()/(10**6)
#t2_16 = grant_16['Gesamt'].sum()/(10**6)
#print(t2_15, t2_16)

In [31]:
grant_16.to_csv('grant_16.csv', encoding='utf-8', index=False)
grant_15.to_csv('grant_15.csv', encoding='utf-8', index=False)