In [1]:
# -*- coding: UTF-8 -*-
# Render our plots inline
%matplotlib inline
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import seaborn
import shutil
# Make the graphs a bit prettier (overridden by seaborn).
# 'display.mpl_style' is not available in newer pandas releases; pandas' OptionError
# derives from KeyError, so a missing option is skipped instead of aborting the cell.
try:
    pd.set_option('display.mpl_style', 'default')
except KeyError:
    pass
pd.set_option('display.max_columns', None) # Display all the columns
plt.rcParams['font.family'] = 'sans-serif' # Sans Serif fonts for all the graphs
# Reference for color palettes: http://web.stanford.edu/~mwaskom/software/seaborn/tutorial/color_palettes.html
# Change the font (this replaces the 'sans-serif' family set just above)
matplotlib.rcParams.update({'font.family': 'Source Sans Pro'})
In [2]:
# Read the survey answers (UTF-8 encoded CSV) into a DataFrame
survey_csv = "data/lab-survey.csv"
data = pd.read_csv(survey_csv, encoding="utf-8")
In [3]:
# Check data
#data[0:4] # Equivalent to data.head(4)
In [4]:
# Range: D21[SQ001] - D21[SQ008] - D21[other]
# Survey columns of the eight predefined answers
lab_columns = ['D21[SQ001]','D21[SQ002]','D21[SQ003]','D21[SQ004]','D21[SQ005]','D21[SQ006]','D21[SQ007]','D21[SQ008]']
# Label of each answer, in the same order as lab_columns (the two lists are matched by position)
lab_options = ['Nessuna formalizzazione',
               'Associazione di fatto',
               'Associazione registrata',
               'Fondazione',
               'Srl',
               'Cooperativa',
               u'Unità o dipartimento di ente',
               u'Unità o dipartimento di impresa']
# Get rid of accented characters.
# replace() without inplace returns a new frame: the column selection taken from `data`
# is not mutated in place, and `lab` is an independent frame that can safely receive
# new columns later on.
lab = data[lab_columns].replace(u'Sì', 'Si')
# Answers found in the 'D21[other]' column, lower-cased and counted
lab_other = data['D21[other]'].str.lower().value_counts()
In [5]:
#lab[0:4]
In [6]:
# Create all the possible combinations from the main options
# See http://stackoverflow.com/questions/17176887/python-get-all-permutation-of-a-list-w-o-repetitions
import itertools

# Maps each combination (a tuple of column names) to its description:
#   "col_list"  - the column names as a list
#   "list"      - the matching option labels
#   "bool_list" - one flag per entry of lab_columns, True where the column is in the combination
#   "str"       - the option labels joined with ", "
all_combinations = {}
all_combinations_columns = []
for size in range(1, len(lab_columns) + 1):
    for columns in itertools.combinations(lab_columns, size):
        # Position of every selected column inside lab_columns
        positions = [lab_columns.index(column) for column in columns]
        labels = [lab_options[position] for position in positions]
        flags = [position in positions for position in range(len(lab_columns))]
        all_combinations[columns] = {
            "col_list": list(columns),
            "list": labels,
            "bool_list": flags,
            "str": ", ".join(labels),
        }
In [7]:
# Check which combination corresponds to each row
# Index the combination labels by their boolean pattern so each row is a direct lookup
label_by_pattern = {}
for combination in all_combinations.values():
    label_by_pattern[tuple(combination["bool_list"])] = combination["str"]
# True where the answer is "Si". Only the option columns are read, so the cell can be
# re-run after the "Combination" column has been added to `lab`.
answered = lab[lab_columns].isin(["Si"])
# One entry per row, in row order. A row with no "Si" at all matches no combination
# (combinations start at size 1) and gets NaN, so str_values always has exactly one
# entry per row of `lab` and the labels cannot shift onto the wrong rows.
str_values = [label_by_pattern.get(tuple(row), np.nan) for row in answered.values.tolist()]
In [8]:
# Store the combination label of each row in a new "Combination" column
# (the Series gets a default 0..n-1 index and is matched to `lab` on index labels)
combination_labels = pd.Series(str_values)
lab["Combination"] = combination_labels
In [9]:
# Gather data: how often each combination occurs, as a count and as a percentage
combination_column = lab["Combination"]
resulting_combinations = combination_column.value_counts()
resulting_combinations_percentage = combination_column.value_counts(normalize=True) * 100
In [10]:
# Bar chart of the first 10 entries of the combination counts (the most popular ones)
top_combinations = resulting_combinations[0:10]
axes = top_combinations.plot(kind='bar', figsize=(20,10), rot=90)
axes.set_title(u"Qual è la forma giuridica del laboratorio? Combinazioni", fontsize=18, y=1.02)
axes.set_ylabel("Lab", fontsize=16)
axes.set_xlabel("Combinazioni", fontsize=16)
# Export the current figure once per output format
for figure_path in ("svg/Q021-Combinazioni.svg",
                    "png/Q021-Combinazioni.png",
                    "pdf/Q021-Combinazioni.pdf"):
    plt.savefig(figure_path)
In [11]:
%%capture output
# Save the output as a variable that can be saved to a file
# Data of the combinations
print "Data:"
print resulting_combinations
print
# Data of the combinations: percentage
print "Data %:"
print resulting_combinations_percentage
In [12]:
# Save+show the output to a text file
# `output` is the variable filled by the `%%capture output` cell above; its text is
# first written by %save to a .py file in the working directory
# (NOTE(review): the .py name is presumably used because %save expects it - confirm)
%save Q021-Combinazioni.py str(output)
# The saved file is then moved into text/ under a .txt name
shutil.move("Q021-Combinazioni.py", "text/Q021-Combinazioni.txt")