In [1]:
import os
filename = 'HCEPDB_moldata.zip'
if os.path.exists(filename):
    print('File already exists.')
else:
    print("File doesn't exist.")
In [2]:
import requests
url = 'http://faculty.washington.edu/dacb/HCEPDB_moldata.zip'
req = requests.get(url)
assert req.status_code == 200
with open(filename, 'wb') as f:
    f.write(req.content)
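If the existence check from the first cell is combined with the download, the file only needs to be fetched when it is missing. This is an optional sketch rather than part of the original notebook; the helper name download_if_missing is my own.

import os
import requests

def download_if_missing(url, filename):
    # Only fetch the file if it is not already on disk.
    if os.path.exists(filename):
        print(filename, 'already exists, skipping download.')
        return
    req = requests.get(url)
    assert req.status_code == 200
    with open(filename, 'wb') as f:
        f.write(req.content)
    print(filename, 'downloaded.')

download_if_missing('http://faculty.washington.edu/dacb/HCEPDB_moldata.zip',
                    'HCEPDB_moldata.zip')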
In [3]:
import zipfile
import pandas as pd
csv_filename = 'HCEPDB_moldata.csv'
zf = zipfile.ZipFile(filename)
data = pd.read_csv(zf.open(csv_filename))
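As a side note, pandas can usually read a zipped CSV without an explicit zipfile step, because read_csv infers zip compression from the file extension. This only works when the archive contains exactly one CSV file, which is assumed here:

import pandas as pd

# read_csv infers zip compression from the extension; valid when the
# archive holds exactly one CSV file.
data = pd.read_csv('HCEPDB_moldata.zip')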
In [4]:
data.head()
Out[4]:
In part 2, I download 3 .zip files and read their 3 .csv files. I use os.path.exists in a for loop with an if/else to check whether each file already exists, and print out the results. I also add print commands in the loop to show which file is downloading and to tell when it is done. I use a zf list and a data list to read the 3 .csv files respectively inside the for loop, and then use append to merge all the data. Finally, I use shape and tail to check the data.
In [5]:
import os
import requests
import zipfile
import pandas as pd
zipfiles = ['HCEPDB_moldata_set1.zip', 'HCEPDB_moldata_set2.zip', 'HCEPDB_moldata_set3.zip']
url = {'HCEPDB_moldata_set1.zip': 'http://faculty.washington.edu/dacb/HCEPDB_moldata_set1.zip',
       'HCEPDB_moldata_set2.zip': 'http://faculty.washington.edu/dacb/HCEPDB_moldata_set2.zip',
       'HCEPDB_moldata_set3.zip': 'http://faculty.washington.edu/dacb/HCEPDB_moldata_set3.zip'}
csvfile = {'HCEPDB_moldata_set1.zip': 'HCEPDB_moldata_set1.csv',
           'HCEPDB_moldata_set2.zip': 'HCEPDB_moldata_set2.csv',
           'HCEPDB_moldata_set3.zip': 'HCEPDB_moldata_set3.csv'}
zf = []
data = []
alldata = pd.DataFrame()
for i in range(len(zipfiles)):
    # Check whether the file already exists.
    if os.path.exists(zipfiles[i]):
        print(zipfiles[i], 'exists.')
    else:
        print(zipfiles[i], "doesn't exist.")
    # Download the file.
    print(zipfiles[i], 'is downloading.')
    req = requests.get(url[zipfiles[i]])
    assert req.status_code == 200
    with open(zipfiles[i], 'wb') as f:
        f.write(req.content)
    print(zipfiles[i], 'is downloaded.')
    # Unzip the archive and read its .csv file.
    zf.append(zipfile.ZipFile(zipfiles[i]))
    data.append(pd.read_csv(zf[i].open(csvfile[zipfiles[i]])))
    # DataFrame.append was removed in pandas 2.0, so use pd.concat instead.
    alldata = pd.concat([alldata, data[i]], ignore_index=True)
# Check the data.
print('\nCheck data')
print('shape of', csvfile[zipfiles[0]], '=', data[0].shape,
      '\nshape of', csvfile[zipfiles[1]], '=', data[1].shape,
      '\nshape of', csvfile[zipfiles[2]], '=', data[2].shape,
      '\nshape of all data =', alldata.shape)
print('\n')
alldata.tail()
Out[5]:
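An alternative, more common pandas pattern is to collect the per-file DataFrames in a list and concatenate them once after the loop instead of growing alldata inside it. This sketch assumes the zipfiles list and csvfile dictionary defined above and that the archives have already been downloaded:

import zipfile
import pandas as pd

# Read each archive, collect the per-file DataFrames, then concatenate once.
frames = []
for name in zipfiles:
    with zipfile.ZipFile(name) as archive:
        frames.append(pd.read_csv(archive.open(csvfile[name])))
alldata = pd.concat(frames, ignore_index=True)
print('shape of all data =', alldata.shape)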
In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import math
alldata['(xi-x)^2'] = (alldata['mass'] - alldata['mass'].mean())**2
SD = math.sqrt(sum(alldata['(xi-x)^2']) / alldata.shape[0])
M = alldata['mass'].mean()
print('standard deviation of mass =', SD, ', mean of mass =', M, "\n")
# Bin mass by its distance from the mean in units of the standard deviation.
alldata['mass_group'] = pd.cut(alldata['mass'],
                               bins=[min(alldata['mass']), M-3*SD, M-2*SD, M-SD, M+SD, M+2*SD, M+3*SD, max(alldata['mass'])],
                               labels=["<(-3SD)", "-3SD~-2SD", "-2SD~-SD", "-SD~+SD", "+SD~+2SD", "+2SD~+3SD", ">(+3SD)"],
                               include_lowest=True)
count = alldata['mass_group'].value_counts(normalize=True)
print("Fraction of data in each group\n", count, "\n")
# Compare against the 68-95-99.7 rule for a normal distribution.
print("within 1 standard deviation:", count["-SD~+SD"],
      "\nwithin 2 standard deviations:", count["-2SD~-SD"] + count["-SD~+SD"] + count["+SD~+2SD"],
      "\nwithin 3 standard deviations:", count["-3SD~-2SD"] + count["-2SD~-SD"] + count["-SD~+SD"] + count["+SD~+2SD"] + count["+2SD~+3SD"], "\n")
print("Conclusion: mass is close to a normal distribution!")
In part 2, I can download multiple files that do not exist yet, and the code is about as long as part 1, which means it is much shorter than replicating the part 1 code three times. Furthermore, if new data files need to be downloaded, I only have to append the new filename to the zipfiles list and add its URL and .csv filename to the url and csvfile dictionaries. The rest of the code stays unchanged, which makes it easy to maintain.
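For example, registering a hypothetical fourth data set would only take three extra lines before the loop (the set4 filenames and URL below are made up for illustration):

# Hypothetical fourth data set; the set4 filenames and URL are illustrative only.
zipfiles.append('HCEPDB_moldata_set4.zip')
url['HCEPDB_moldata_set4.zip'] = 'http://faculty.washington.edu/dacb/HCEPDB_moldata_set4.zip'
csvfile['HCEPDB_moldata_set4.zip'] = 'HCEPDB_moldata_set4.csv'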
In [ ]: