Some setup:
In [3]:
import pandas as pd
import numpy as np
doc = pd.read_excel('/home/rick/Downloads/ADS-GC/Portfolio/Original Assignments/GRAIN---Land-grab-deals---Jan-2012.xls')
doc[:3]
Out[3]:
In [4]:
# fix Status of deal
valid_statuses = {'done': 'done',
'suspended': 'suspended',
'proposed': 'proposed',
'in process': 'in process',
'signed': 'in process'}
def fix_status_of_deal(field: str):
for value, key in valid_statuses.items():
if value in field.lower():
return key
return field.strip()
doc['Status of deal'] = doc['Status of deal'].map(fix_status_of_deal)
doc['Status of deal'].unique()
Out[4]:
In [9]:
import pandas as pd
for column in doc.columns:
col = doc[column] # type: pd.Series
print('%s: %s' % (column, col.isnull().sum()))
# could go through the effort to replace it with 'Missing' but that is actually less useful then NaN/null
In [10]:
# Make Project investment numeric
import re
re_avg = re.compile(r'(\d+)-(\d+)(E\d+)')
def fixnumb(inp: str):
if isinstance(inp, float):
return inp
if not inp or not inp.strip():
return ''
x = inp.upper().replace('US$', '').replace(' ', '').replace(',', '.')
x = x.replace('BILLION', 'E9').replace('MILLION', 'E6')
try:
return float(x)
except ValueError as e:
if re_avg.match(x):
res = re_avg.search(x)
left, right, sin = res.groups()
left = float(left)
right = float(right)
avg = (left+right)/2
try:
return float(repr(avg) + sin)
except ValueError as e:
print('x: %r, a: %r, %s %s' % (x, avg, e, inp))
return inp
print('x: %r, %s %s' % (x, e, inp))
return inp
doc['Projected investment'] = doc['Projected investment'].map(fixnumb)
doc[:3]
Out[10]:
In [11]:
import difflib
re_split = re.compile(r'(?:,|&|;|and|\n|\([^)]+\))')
options = []
def fix_production(x: str):
# already parsed
if isinstance(x, list):
return x
# empty, integer, float, etc.
if type(x) != str:
return []
# Split the text into words, ignoring 'and' and inside braces
x = [y.strip() for y in re_split.split(x.lower()) if y.strip()]
y = []
for part in x:
# Check if we already know a similar word, if not add it otherwise use the known word
matches = difflib.get_close_matches(part, options, n=1)
if not matches:
options.append(part)
y.append(part)
else:
y.append(matches[0])
return y
doc['Production'] = doc['Production'].map(fix_production)
doc[:10]
Out[11]:
In [5]:
pd.pivot_table(doc, values=['Hectares'], index=['Base', 'Landgrabber'])
Out[5]: