In [1]:
import os

# Check whether the zip file has already been downloaded.
filename = 'HCEPDB_moldata.zip'
if os.path.exists(filename):
    print('File already exists.')
else:
    print("File doesn't exist.")
In [2]:
import requests

# Download the zip file and confirm the request succeeded before writing it to disk.
url = 'http://faculty.washington.edu/dacb/HCEPDB_moldata.zip'
req = requests.get(url)
assert req.status_code == 200
with open(filename, 'wb') as f:
    f.write(req.content)
In [3]:
import zipfile
import pandas as pd

# Open the zip archive and read the .csv inside it into a DataFrame.
csv_filename = 'HCEPDB_moldata.csv'
zf = zipfile.ZipFile(filename)
data = pd.read_csv(zf.open(csv_filename))
In [4]:
data.head()
Out[4]:
Part 2: download the three zip files and read each .csv. Use os.path.exists inside a for loop, with if/else branches to print whether each file already exists. Print messages inside the loop to show which file is downloading and to report when it is done. Use the zf and data lists to read the three .csv files, merging all of the data as the loop runs. Finally, use shape and tail to check the data.
In [5]:
import os
import requests
import zipfile
import pandas as pd

zipfiles = ['HCEPDB_moldata_set1.zip', 'HCEPDB_moldata_set2.zip', 'HCEPDB_moldata_set3.zip']
url = {'HCEPDB_moldata_set1.zip': 'http://faculty.washington.edu/dacb/HCEPDB_moldata_set1.zip',
       'HCEPDB_moldata_set2.zip': 'http://faculty.washington.edu/dacb/HCEPDB_moldata_set2.zip',
       'HCEPDB_moldata_set3.zip': 'http://faculty.washington.edu/dacb/HCEPDB_moldata_set3.zip'}
csvfile = {'HCEPDB_moldata_set1.zip': 'HCEPDB_moldata_set1.csv',
           'HCEPDB_moldata_set2.zip': 'HCEPDB_moldata_set2.csv',
           'HCEPDB_moldata_set3.zip': 'HCEPDB_moldata_set3.csv'}

zf = []
data = []
alldata = pd.DataFrame()
for i in range(len(zipfiles)):
    # Check whether the file already exists.
    if os.path.exists(zipfiles[i]):
        print(zipfiles[i], 'exists.')
    else:
        print(zipfiles[i], "doesn't exist.")
    # Download the file.
    print(zipfiles[i], 'is downloading.')
    req = requests.get(url[zipfiles[i]])
    assert req.status_code == 200
    with open(zipfiles[i], 'wb') as f:
        f.write(req.content)
    print(zipfiles[i], 'is downloaded.')
    # Unzip and read the .csv file.
    zf.append(zipfile.ZipFile(zipfiles[i]))
    data.append(pd.read_csv(zf[i].open(csvfile[zipfiles[i]])))
    # DataFrame.append is deprecated; concatenate instead.
    alldata = pd.concat([alldata, data[i]], ignore_index=True)

# Check the data.
print('\nCheck data')
for i in range(len(zipfiles)):
    print('shape of', csvfile[zipfiles[i]], '=', data[i].shape)
print('shape of all data =', alldata.shape)
print('\n')
alldata.tail()
Out[5]:
In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import math

# Compute the mean and (population) standard deviation of the mass column.
alldata['(xi-x)^2'] = (alldata['mass'] - alldata['mass'].mean())**2
SD = math.sqrt(sum(alldata['(xi-x)^2']) / alldata.shape[0])
M = alldata['mass'].mean()
print('standard deviation of mass =', SD, ', mean of mass =', M, "\n")

# Bin each molecule by how many standard deviations its mass lies from the mean.
alldata['mass_group'] = pd.cut(alldata['mass'],
                               bins=[min(alldata['mass']), M-3*SD, M-2*SD, M-SD, M+SD, M+2*SD, M+3*SD, max(alldata['mass'])],
                               labels=["<(-3SD)", "-3SD~-2SD", "-2SD~-SD", "-SD~+SD", "+SD~+2SD", "+2SD~+3SD", ">(+3SD)"])
count = alldata['mass_group'].value_counts(normalize=True)
print("Fraction of rows in each group\n", count, "\n")

# Index the groups by label so the sums do not depend on the sort order of value_counts.
within1 = count["-SD~+SD"]
within2 = within1 + count["-2SD~-SD"] + count["+SD~+2SD"]
within3 = within2 + count["-3SD~-2SD"] + count["+2SD~+3SD"]
print("within 1 standard deviation:", within1,
      "\nwithin 2 standard deviations:", within2,
      "\nwithin 3 standard deviations:", within3, "\n")
print("Conclusion: mass is approximately normally distributed!")
In part 2, I can download multiple files that do not exist yet, and the code is barely longer than part 1, so it is much shorter than copying the part 1 code three times. Furthermore, if new data files need to be downloaded, I only have to append the new zip filename to the zipfiles list and add its URL and .csv filename to the dictionaries (see the sketch below); the rest of the code is unchanged, which makes it easy to maintain.
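For example, registering a hypothetical fourth data set would only take three extra lines before the download loop runs. The filename and URL below are placeholders for illustration, not real files.
In [ ]:
# Hypothetical example: register a new data set before running the download loop.
# 'HCEPDB_moldata_set4.zip' and its URL are placeholders, not real files.
zipfiles.append('HCEPDB_moldata_set4.zip')
url['HCEPDB_moldata_set4.zip'] = 'http://faculty.washington.edu/dacb/HCEPDB_moldata_set4.zip'
csvfile['HCEPDB_moldata_set4.zip'] = 'HCEPDB_moldata_set4.csv'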
In [ ]: