In [1]:
import os
filename = 'HCEPDB_moldata.zip'
if os.path.exists(filename):
    print('File already exists.')
else:
    print("File doesn't exist.")
In [2]:
import requests
url = 'http://faculty.washington.edu/dacb/HCEPDB_moldata.zip'
req = requests.get(url)
assert req.status_code == 200
with open(filename, 'wb') as f:
    f.write(req.content)
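If the existence check from the first cell is combined with the download, the file only needs to be fetched when it is missing. This is an optional sketch rather than part of the original notebook; the helper name download_if_missing is my own.

import os
import requests

def download_if_missing(url, filename):
    # Only fetch the file if it is not already on disk.
    if os.path.exists(filename):
        print(filename, 'already exists, skipping download.')
        return
    req = requests.get(url)
    assert req.status_code == 200
    with open(filename, 'wb') as f:
        f.write(req.content)
    print(filename, 'downloaded.')

download_if_missing('http://faculty.washington.edu/dacb/HCEPDB_moldata.zip',
                    'HCEPDB_moldata.zip')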
In [3]:
import zipfile
import pandas as pd
csv_filename = 'HCEPDB_moldata.csv'
zf = zipfile.ZipFile(filename)
data = pd.read_csv(zf.open(csv_filename))
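As a side note, pandas can usually read a zipped CSV without an explicit zipfile step, because read_csv infers zip compression from the file extension. This only works when the archive contains exactly one CSV file, which is assumed here:

import pandas as pd

# read_csv infers zip compression from the extension; valid when the
# archive holds exactly one CSV file.
data = pd.read_csv('HCEPDB_moldata.zip')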
In [4]:
data.head()
Out[4]:
In part 2, I download 3 .zip files and read their 3 .csv files. I use os.path.exists in a for loop with an if/else to check whether each file already exists, and print out the results. I also add print commands in the loop to show which file is downloading and to tell when it is done. I use a zf list and a data list to read the 3 .csv files respectively inside the for loop, and then use append to merge all the data. Finally, I use shape and tail to check the data.
In [5]:
import os
import requests
import zipfile
import pandas as pd
zipfiles = ['HCEPDB_moldata_set1.zip', 'HCEPDB_moldata_set2.zip', 'HCEPDB_moldata_set3.zip']
url = {'HCEPDB_moldata_set1.zip': 'http://faculty.washington.edu/dacb/HCEPDB_moldata_set1.zip',
       'HCEPDB_moldata_set2.zip': 'http://faculty.washington.edu/dacb/HCEPDB_moldata_set2.zip',
       'HCEPDB_moldata_set3.zip': 'http://faculty.washington.edu/dacb/HCEPDB_moldata_set3.zip'}
csvfile = {'HCEPDB_moldata_set1.zip': 'HCEPDB_moldata_set1.csv',
           'HCEPDB_moldata_set2.zip': 'HCEPDB_moldata_set2.csv',
           'HCEPDB_moldata_set3.zip': 'HCEPDB_moldata_set3.csv'}
zf = []
data = []
alldata = pd.DataFrame()
for i in range(len(zipfiles)):
    # Check whether the file already exists.
    if os.path.exists(zipfiles[i]):
        print(zipfiles[i], 'exists.')
    else:
        print(zipfiles[i], "doesn't exist.")
    # Download the file.
    print(zipfiles[i], 'is downloading.')
    req = requests.get(url[zipfiles[i]])
    assert req.status_code == 200
    with open(zipfiles[i], 'wb') as f:
        f.write(req.content)
    print(zipfiles[i], 'is downloaded.')
    # Unzip the archive and read its .csv file.
    zf.append(zipfile.ZipFile(zipfiles[i]))
    data.append(pd.read_csv(zf[i].open(csvfile[zipfiles[i]])))
    # DataFrame.append was removed in pandas 2.0, so use pd.concat instead.
    alldata = pd.concat([alldata, data[i]], ignore_index=True)
# Check the data.
print('\nCheck data')
print('shape of', csvfile[zipfiles[0]], '=', data[0].shape,
      '\nshape of', csvfile[zipfiles[1]], '=', data[1].shape,
      '\nshape of', csvfile[zipfiles[2]], '=', data[2].shape,
      '\nshape of all data =', alldata.shape)
print('\n')
alldata.tail()
Out[5]:
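An alternative, more common pandas pattern is to collect the per-file DataFrames in a list and concatenate them once after the loop instead of growing alldata inside it. This sketch assumes the zipfiles list and csvfile dictionary defined above and that the archives have already been downloaded:

import zipfile
import pandas as pd

# Read each archive, collect the per-file DataFrames, then concatenate once.
frames = []
for name in zipfiles:
    with zipfile.ZipFile(name) as archive:
        frames.append(pd.read_csv(archive.open(csvfile[name])))
alldata = pd.concat(frames, ignore_index=True)
print('shape of all data =', alldata.shape)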
In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import math
alldata['(xi-x)^2'] = (alldata['mass'] - alldata['mass'].mean())**2
SD = math.sqrt(sum(alldata['(xi-x)^2']) / alldata.shape[0])
M = alldata['mass'].mean()
print('standard deviation of mass =', SD, ', mean of mass =', M, "\n")
# Bin mass by its distance from the mean in units of the standard deviation.
alldata['mass_group'] = pd.cut(alldata['mass'],
                               bins=[min(alldata['mass']), M-3*SD, M-2*SD, M-SD, M+SD, M+2*SD, M+3*SD, max(alldata['mass'])],
                               labels=["<(-3SD)", "-3SD~-2SD", "-2SD~-SD", "-SD~+SD", "+SD~+2SD", "+2SD~+3SD", ">(+3SD)"],
                               include_lowest=True)
count = alldata['mass_group'].value_counts(normalize=True)
print("Fraction of data in each group\n", count, "\n")
# Compare against the 68-95-99.7 rule for a normal distribution.
print("within 1 standard deviation:", count["-SD~+SD"],
      "\nwithin 2 standard deviations:", count["-2SD~-SD"] + count["-SD~+SD"] + count["+SD~+2SD"],
      "\nwithin 3 standard deviations:", count["-3SD~-2SD"] + count["-2SD~-SD"] + count["-SD~+SD"] + count["+SD~+2SD"] + count["+2SD~+3SD"], "\n")
print("Conclusion: mass is close to a normal distribution!")
In part 2, I can download multiple files that do not exist yet, and the code is about as long as part 1, which means it is much shorter than replicating the part 1 code three times. Furthermore, if new data files need to be downloaded, I only have to append the new filename to the zipfiles list and add its URL and .csv filename to the url and csvfile dictionaries. The rest of the code stays unchanged, which makes it easy to maintain.
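For example, registering a hypothetical fourth data set would only take three extra lines before the loop (the set4 filenames and URL below are made up for illustration):

# Hypothetical fourth data set; the set4 filenames and URL are illustrative only.
zipfiles.append('HCEPDB_moldata_set4.zip')
url['HCEPDB_moldata_set4.zip'] = 'http://faculty.washington.edu/dacb/HCEPDB_moldata_set4.zip'
csvfile['HCEPDB_moldata_set4.zip'] = 'HCEPDB_moldata_set4.csv'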
In [ ]: