In [ ]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab as pl
import random
import time
import urllib.request
from urllib.request import Request
from pandas.tools.plotting import scatter_matrix
from __future__ import division
from scipy import optimize
%matplotlib inline
matplotlib.style.use('ggplot')
In [2]:
# Remote sources used by the download helpers, one URL per entry.
# NOTE(review): the catalogo.datos.gob.mx entries look like dataset resource
# pages rather than direct .csv downloads - confirm before relying on them.
csv_links = [
    'https://siac.funcionpublica.gob.mx/DatosAbiertos/sanc/ProveedoresyContratistasSancionados.csv',
    'http://catalogo.datos.gob.mx/dataset/procedimientos-de-contratacion-2016/resource/1108367a-9e0e-4719-95aa-29e214006380',
    'http://catalogo.datos.gob.mx/dataset/procedimientos-de-contratacion-2015/resource/20b85cbd-4a8a-4c99-b14c-44f9963ea0bd',
    'http://catalogo.datos.gob.mx/dataset/procedimientos-de-contratacion-2014/resource/8bf3dd44-aa2c-4307-9e4e-7063bf1ffd5a',
    'http://catalogo.datos.gob.mx/dataset/procedimientos-de-contratacion-2013/resource/edf9d32e-79f1-4fc3-afc8-fedd693a91e6',
]

# Links that do not work: the 2011 through 2015 files of this series fail to download.
no_jalan = [
    'http://www.datos.economia.gob.mx/TablacomprasgobiernoMIPYMES2011.csv',
]
def get_db(url, encoding='utf-8'):
    """Download `url` and return the response body as decoded text.

    The previous body referenced two names that are never defined (`s` and
    `CSV_URL`), so every call raised NameError; it also never read or closed
    the opened response and returned nothing.

    Parameters
    ----------
    url : str
        Address to fetch. Anything `urllib.request.urlopen` accepts works.
    encoding : str, optional
        Codec used to decode the downloaded bytes. Defaults to 'utf-8', the
        codec the original code used.

    Returns
    -------
    str
        The full response body decoded with `encoding`.

    Raises
    ------
    urllib.error.URLError
        If the resource cannot be opened.
    UnicodeDecodeError
        If the body is not valid text in `encoding`.
    """
    # The context manager closes the connection even when read() fails.
    with urllib.request.urlopen(Request(url)) as response:
        raw_bytes = response.read()
    return raw_bytes.decode(encoding)
def get():
    """Fetch every URL listed in `csv_links` through `get_db`.

    The original loop threw away the result of each call, so nothing that was
    downloaded could be used afterwards. The results are now collected.

    Returns
    -------
    list
        Whatever `get_db` returned for each URL, in the same order as
        `csv_links`.
    """
    return [get_db(link) for link in csv_links]
In [3]:
# Local file names of the yearly contract files, 2013 through 2016 in ascending order.
files_contratos = ['Contratos{}.csv'.format(year) for year in range(2013, 2017)]

# Local file names of the remaining (non-contract) data files.
files_otros = ['ProveedoresyContratistasSancionados.csv']
In [4]:
def read_csv(file_name):
    """Load a CSV file into a pandas DataFrame.

    The file is decoded as UTF-8, its first row supplies the column names and
    its first column becomes the index of the returned frame.

    Parameters
    ----------
    file_name : str
        Path (or any other source `pd.read_csv` accepts) of the file to load.

    Returns
    -------
    pandas.DataFrame
    """
    read_options = {'index_col': 0, 'header': 0, 'encoding': 'utf-8'}
    return pd.read_csv(file_name, **read_options)
In [ ]:
# Load the 2014 contracts file; the cells below query this frame as `df`.
df = read_csv(file_name='final_project/Contratos2014.csv')
In [36]:
# Top contractors by number of rows in `df`.
# The sort is descending and cut to the first 10 entries so the cell really
# shows the "top" contractors instead of dumping every group smallest-first.
df.groupby('PROVEEDOR_CONTRATISTA').size().sort_values(ascending=False).head(10)
Out[36]:
In [37]:
# Top contractors by summed IMPORTE_CONTRATO.
# Only the amount column is aggregated: summing every column is wasted work and,
# on recent pandas, non-numeric columns are no longer dropped silently.
# The sort is descending and cut to the first 10 so the largest totals come first.
# NOTE(review): assumes IMPORTE_CONTRATO was parsed as a numeric column - confirm.
(
    df.groupby('PROVEEDOR_CONTRATISTA')[['IMPORTE_CONTRATO']]
    .sum()
    .sort_values('IMPORTE_CONTRATO', ascending=False)
    .head(10)
)
Out[37]:
In [38]:
def null_count(data_frame):
    """Tabulate missing versus present cells for each column.

    The frame is reshaped to long form (one row per cell) and the cells are
    then cross-tabulated by source column and by whether they are null.

    Parameters
    ----------
    data_frame : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        One row per column of `data_frame`; the column labelled True holds the
        number of null cells and the one labelled False the number of non-null
        cells. A label that never occurs is absent from the result.
    """
    melted = pd.melt(data_frame)
    is_missing = melted['value'].isnull()
    return pd.crosstab(melted['variable'], is_missing)
def _outlier_mask(df, var_name, n_std=3):
    """Return a boolean Series marking rows whose `var_name` value is an outlier.

    A value is an outlier when its absolute distance from the column mean is at
    least `n_std` times the column standard deviation (as computed by
    `Series.std`).
    """
    values = df[var_name]
    # Distance is measured from the mean itself. The earlier code measured it
    # from abs(mean), which flagged every row of a column with a negative mean.
    distance = (values - values.mean()).abs()
    # Note: with `>=`, a column whose standard deviation is 0 has every row flagged.
    return distance >= n_std * values.std()


def outliers(df, var_name, n_std=3):
    """Return the rows of `df` whose `var_name` value is an outlier.

    Parameters
    ----------
    df : pandas.DataFrame
    var_name : str
        Name of the numeric column to test.
    n_std : float, optional
        How many standard deviations from the mean count as an outlier.
        Defaults to 3, the threshold that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        The subset of `df` flagged as outliers. Rows whose value is missing are
        never flagged.
    """
    return df[_outlier_mask(df, var_name, n_std)]


def outliers_remove(df, var_name, n_std=3):
    """Return `df` without the rows whose `var_name` value is an outlier.

    Uses the same rule and parameters as `outliers`; the result is the
    complement of what `outliers` returns, so rows with a missing value are kept.
    """
    return df[~_outlier_mask(df, var_name, n_std)]
def df_stats(data_frame):
    """Return the descriptive-statistics summary of `data_frame`.

    This is a thin wrapper around `DataFrame.describe` called with its default
    arguments.

    Parameters
    ----------
    data_frame : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        The table produced by `data_frame.describe()`.
    """
    summary = data_frame.describe()
    return summary
def run(file_name='final_project/Contratos2014.csv'):
    """Load a contracts file, print its null counts and return its summary statistics.

    Parameters
    ----------
    file_name : str, optional
        Path of the CSV file to analyse. Defaults to the 2014 contracts file
        that used to be hard-coded, so `run()` behaves as before.

    Returns
    -------
    The value returned by `df_stats` for the loaded frame.
    """
    # A distinct local name avoids confusion with the notebook-level `df`.
    contracts = read_csv(file_name)
    print('\n Null Values Count (True)', null_count(contracts))
    return df_stats(contracts)
In [ ]:
In [39]:
# Keep the summary under its own name: assigning it to `df` replaced the raw
# contracts frame that the other cells query. The stray `df.col` expression
# (an unfinished attribute lookup that raises AttributeError) is dropped.
contratos_stats = run()
contratos_stats
In [26]:
# Show the column labels of `df`.
# NOTE(review): which frame `df` holds here depends on which earlier cell ran
# last (the raw load or `df = run()`) - confirm on a fresh Restart & Run All.
df.columns
Out[26]:
In [ ]: