In [1]:
# https://github.com/fnielsen/everything
from everything import *
In [2]:
# Read the dataframe with features for companies from a UTF-8 encoded CSV
# file; the first CSV column is used as the row index.
filename = expanduser('~/workspace/cvrminer/virksomheder-features.csv')
df = read_csv(filename, encoding='utf-8', index_col=0)
In [3]:
# Feature names: display the column labels of the loaded dataframe
df.columns
Out[3]:
In [4]:
# Functions for conversion to numerical dataframes
def to_dummies(df, column):
    """Convert one column of a dataframe to a purely numerical dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe holding the column.
    column : str
        Name of the column to convert.

    Returns
    -------
    pandas.DataFrame
        For integer and floating-point columns (of any width) the column
        itself as a one-column dataframe; for boolean columns the column cast
        to int; for object columns one indicator column per distinct string
        value, named ``"<column>:<value>"``.

    Raises
    ------
    ValueError
        If the column has any other datatype.

    """
    datatype = df[column].dtypes
    # ``kind`` is 'i', 'u' or 'f' for every signed, unsigned or floating-point
    # width, so e.g. int32 and float32 columns are accepted in addition to
    # int64 and float64.
    if datatype.kind in ('i', 'u', 'f'):
        return df[[column]]
    elif datatype == bool:
        return df[[column]].astype(int)
    elif datatype == 'object':
        # NOTE: Series.str.get_dummies splits each string on '|' by default,
        # so a value containing '|' yields several indicator columns.
        df_column = df[column].str.get_dummies()
        df_column.columns = [column + ":" + col for col in df_column.columns]
        return df_column
    else:
        raise ValueError('Unrecognized datatype for column {}'.format(column))
def dataframe_to_numerical(df, verbose=True):
    """Convert all columns of a dataframe to a numerical dataframe.

    Each column is converted with `to_dummies` and the results are placed
    side by side in the column order of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to convert.
    verbose : bool, optional
        If true (the default), print each column name as it is processed.

    Returns
    -------
    pandas.DataFrame
        Dataframe with the same index as `df` holding the converted columns.

    Raises
    ------
    ValueError
        If a column has a datatype `to_dummies` does not recognize, or if two
        generated columns end up with the same name.

    """
    # Start from an empty frame so the result keeps the index of `df` even
    # when `df` has no columns.
    parts = [DataFrame(index=df.index)]
    for column in df.columns:
        if verbose:
            print(column)
        parts.append(to_dummies(df, column))
    # Concatenate once instead of joining inside the loop: a repeated join
    # copies the growing frame for every column and multiplies rows when the
    # index contains duplicated labels. verify_integrity keeps the ValueError
    # that join raised for clashing column names.
    return pd.concat(parts, axis=1, verify_integrity=True)
In [5]:
# Numerical dataframe: convert every feature column with
# dataframe_to_numerical and display the resulting (rows, columns) shape
dfn = dataframe_to_numerical(df)
dfn.shape
Out[5]:
In [6]:
# Preprocessing: run the numerical features through the imputer, then through
# a scaler that does not centre the data (with_mean=False), and wrap the
# result in a dataframe that reuses the row and column labels of `dfn`.
# NOTE(review): `Imputer` is presumably scikit-learn's; that class was removed
# in scikit-learn 0.22 in favour of `sklearn.impute.SimpleImputer` -- confirm
# which version this notebook is pinned to.
scaler = StandardScaler(with_mean=False)
imputer = Imputer()
dfni = DataFrame(
    data=scaler.fit_transform(imputer.fit_transform(dfn)),
    index=dfn.index,
    columns=dfn.columns,
)
In [7]:
# Count how often each value occurs in the `nyeste_statuskode` column
df.nyeste_statuskode.value_counts()
Out[7]:
In [8]:
# Count how often each value occurs in the `sammensat_status` column
df.sammensat_status.value_counts()
Out[8]:
In [9]:
# Cross-tabulate the two status columns
# (rows: sammensat_status, columns: nyeste_statuskode)
pd.crosstab(df.sammensat_status, df.nyeste_statuskode)
Out[9]:
In [10]:
# Keep only the companies whose `sammensat_status` is 'Aktiv' or
# 'OPLØSTEFTERKONKURS'; work on a copy so that columns added later do not
# touch `df`.
indices = df.sammensat_status.isin(['Aktiv', u'OPLØSTEFTERKONKURS']).values
# `.ix` was deprecated in pandas 0.20 and removed in 1.0; `.loc` performs the
# same boolean-mask row selection.
dfs = df.loc[indices, :].copy()
dfs.shape
Out[10]:
In [11]:
# Binary target: 1 where `sammensat_status` is 'OPLØSTEFTERKONKURS', else 0
dfs['konkurs'] = (dfs.sammensat_status == u'OPLØSTEFTERKONKURS').astype(int)
In [12]:
def transform_year(year, base_year=2000):
    """Express a calendar year as an offset from a base year.

    Parameters
    ----------
    year : number or array-like
        Year or years to transform; anything that supports subtraction.
    base_year : number, optional
        Year that maps to zero. Defaults to 2000.

    Returns
    -------
    Same type as ``year - base_year``
        The offset; negative for years before `base_year`.

    """
    return year - base_year
# Fit a Binomial-family GLM of the `konkurs` indicator on company features.
# The formula log-transforms `antal_penheder + 1`, wraps
# `nyeste_antal_ansatte` in C(), and passes `stiftelsesaar` through
# transform_year.
results = smf.glm(('konkurs ~ np.log(antal_penheder+1) + C(nyeste_antal_ansatte) + '
# Terms currently left out of the model: nyeste_virksomhedsform + nyeste_statuskode +
'branche_ansvarskode + reklamebeskyttet + transform_year(stiftelsesaar)'),
data=dfs, family=sm.families.Binomial()).fit()
In [13]:
# Print the text summary of the fitted model
print(results.summary())
In [14]:
# Predict for the companies whose `sammensat_status` is 'Aktiv' and order them
# by decreasing predicted value.
actives = dfs.sammensat_status.isin(['Aktiv']).values
# `.ix` was deprecated in pandas 0.20 and removed in 1.0; `.loc` performs the
# same boolean-mask row selection.
y_est = results.predict(exog=dfs.loc[actives, :])
# Positions within the active subset, from highest to lowest prediction
indices = argsort(-y_est)
In [15]:
# Uncomment to list the 100 active companies with the highest predicted values:
# dfs.loc[actives, :].iloc[indices, :].head(100)
In [ ]: