In [2]:
import pandas as pd

In [27]:
hours = pd.read_csv("../data/Job_Size__Per_Job_(Core_Count)__by_PI_2014-04-30_to_2015-04-30_aggregate.csv", header=6)

In [33]:
hours['School'] = pd.Series(hours["PI"].str.split('-').str[1], index=hours.index)
hours['PI'] = hours['PI'].str.split('-').str[0]

print hours


                                  PI  Job Size: Per Job (Core Count)  \
0                Di Matteo, Tiziana                            24576   
1                      Beane, Silas                            17280   
2                      Harman, Todd                            14887   
3                     Cosden, Ian A                             7304   
4                Chandran, Benjamin                             6320   
5                Williams, Samuel W                             6221   
6                   Reed, Patrick M                             5272   
7                   Kubis, Tillmann                             4705   
8                Hansen, Charles D.                             4226   
9                   Kunz, Matthew W                             4096   
10           Moschetti, Morgan Paul                             4051   
11                  Ricker, Paul M.                             3755   
12             Bellovary, Jillian M                             3670   
13                 Ballance, Connor                             3642   
14                        Wang, Bei                             3529   
15                   Oldham, Edward                             3416   
16                 Elcock, Adrian H                             3414   
17                    Soltesz, Ivan                             3145   
18                    Peng, Zhangli                             2821   
19                     Madison, Don                             2569   
20                          Liu, Keh                            2506   
21            Blaisdell, Gregory A.                             2457   
22               Windus, Theresa L.                             2400   
23                 Zingale, Michael                             2186   
24    Reynolds, Christopher Stephen                             2174   
25                    Jiang, Yanfei                             2167   
26                    Kareem, Ahsan                             2063   
27              Marinacci, Federico                             1943   
28               Srinivasan, Sundar                             1899   
29                     Lan, Zhiling                             1842   
...                              ...                             ...   
1739           Dorfman, Kevin David                                7   
1740                    Fahey, Mark                                7   
1741               Faucheaux, Jacob                                7   
1742                        Hasegawa                               7   
1743            Schanzenbach, David                                7   
1744                   Alameda, Jay                                6   
1745          Lyon, Jonathan Thomas                                6   
1746                   Nagi, Rakesh                                6   
1747     Hutchison, Geoffrey Rogers                                5   
1748                Lovitt, Charity                                4   
1749           Narayanasamy, Satish                                3   
1750                 Senocak, Inanc                                3   
1751                  Sorin, Daniel                                3   
1752                          Abbott                               2   
1753                   Babak, Tomas                                2   
1754         Krieger, Donald Nathan                                2   
1755     Schelling, Patrick Kenneth                                2   
1756           Stubbs, John Michael                                2   
1757               Chan, Yvonne Ling                               1   
1758                 Clementi, Luca                                1   
1759                  Cohen, Joseph                                1   
1760              Kannappan, Sheila                                1   
1761                    Rhee, David                                1   
1762               Schossau, Jorden                                1   
1763                  Shafi, Qaisar                                1   
1764                Siders, Paul D.                                1   
1765                Snyder, Gregory                                1   
1766                 Toonen, Robert                                1   
1767                Xie, Paul Z. R.                                1   
1768                                                             NaN   

      Std Err of Job Size: Per Job (Core Count)                      School  
0                                      0.000000                         CMU  
1                                      0.000000             U New Hampshire  
2                                    547.803924                      U Utah  
3                                   1414.881575                 Princeton U  
4                                    690.475655             U New Hampshire  
5                                   1306.184281                        LBNL  
6                                   1524.687430                   Cornell U  
7                                   1007.590318                    Purdue U  
8                                    136.020081                      U Utah  
9                                      0.000000                 Princeton U  
10                                    38.000816                        USGS  
11                                   133.289293                        UIUC  
12                                   206.426463                Vanderbilt U  
13                                  1416.889140                    Auburn U  
14                                   287.750637                 Princeton U  
15                                   620.752232           U Mary Washington  
16                                   557.394999                      U Iowa  
17                                    61.932940                   UC Irvine  
18                                   111.887045                         MIT  
19                                   153.311553                Missouri S&T  
20                                    68.029535                        Fei   
21                                    71.960076                    Purdue U  
22                                     0.000000                Iowa State U  
23                                   131.195257            SUNY Stony Brook  
24                                   168.939472                  U Maryland  
25                                   207.699668   Smithsonian Astrophys Obs  
26                                   136.435028                U Notre Dame  
27                                   264.439606                         MIT  
28                                   102.670234                U Washington  
29                                   256.707527                         IIT  
...                                         ...                         ...  
1739                                   0.786836                      U Minn  
1740                                   0.928984                 U Tennessee  
1741                                   0.539137                        UIUC  
1742                                   0.131627              Johnson, Mark   
1743                                   0.108339             U Hawaii, Manoa  
1744                                   0.372112                        UIUC  
1745                                   0.109688             Clayton State U  
1746                                   0.203830                        UIUC  
1747                                   0.425870                U Pittsburgh  
1748                                   0.229677                U Washington  
1749                                   0.000000                  U Michigan  
1750                                   0.000000               Boise State U  
1751                                   0.072014                      Duke U  
1752                                   0.441939            Lyon, Heather L   
1753                                   0.081165                  Stanford U  
1754                                   0.003594                U Pittsburgh  
1755                                   0.317639           U Central Florida  
1756                                   0.050448               U New England  
1757                                   0.000000                     Hsiang   
1758                                   0.000000                        SDSC  
1759                                   0.000000              U Mass, Boston  
1760                                   0.000000                         UNC  
1761                                   0.000000     Einstein College of Med  
1762                                   0.000000            Michigan State U  
1763                                   0.000000                  U Delaware  
1764                                   0.000000              U Minn, Duluth  
1765                                   0.000000                       STScI  
1766                                   0.000000             U Hawaii, Manoa  
1767                                   0.000000     Einstein College of Med  
1768                                        NaN                              

[1769 rows x 4 columns]

In [39]:
hours[(hours.School == 'Princeton University')]


Out[39]:
PI Job Size: Per Job (Core Count) Std Err of Job Size: Per Job (Core Count) School

In [35]:
import matplotlib.pyplot as plt
%matplotlib inline

In [ ]:
plt.barh(y_pos, selected_data["Key"], align='center')
plt.yticks(y_pos, types)
plt.xlabel('Number of Publications')
plt.ylabel('Type of Publication')
plt.title('Number of Publications by Type')

In [31]:
hours["PI"].str.split('-').str[1]


Out[31]:
0                              CMU
1                  U New Hampshire
2                           U Utah
3                      Princeton U
4                  U New Hampshire
5                             LBNL
6                        Cornell U
7                         Purdue U
8                           U Utah
9                      Princeton U
10                            USGS
11                            UIUC
12                    Vanderbilt U
13                        Auburn U
14                     Princeton U
15               U Mary Washington
16                          U Iowa
17                       UC Irvine
18                             MIT
19                    Missouri S&T
20                            Fei 
21                        Purdue U
22                    Iowa State U
23                SUNY Stony Brook
24                      U Maryland
25       Smithsonian Astrophys Obs
26                    U Notre Dame
27                             MIT
28                    U Washington
29                             IIT
                   ...            
1739                        U Minn
1740                   U Tennessee
1741                          UIUC
1742                Johnson, Mark 
1743               U Hawaii, Manoa
1744                          UIUC
1745               Clayton State U
1746                          UIUC
1747                  U Pittsburgh
1748                  U Washington
1749                    U Michigan
1750                 Boise State U
1751                        Duke U
1752              Lyon, Heather L 
1753                    Stanford U
1754                  U Pittsburgh
1755             U Central Florida
1756                 U New England
1757                       Hsiang 
1758                          SDSC
1759                U Mass, Boston
1760                           UNC
1761       Einstein College of Med
1762              Michigan State U
1763                    U Delaware
1764                U Minn, Duluth
1765                         STScI
1766               U Hawaii, Manoa
1767       Einstein College of Med
1768                              
Name: PI, dtype: object

In [4]:
cols_file = open('cols.csv')
cols = cols_file.readline().strip().split(',')
cols = cols[0:2]
cols[1]=cols[1].replace(cols[1],cols[1][1:-1])
cols


Out[4]:
['PI', 'Job Size: Per Job (Core Count)']

In [12]:
size = pd.read_csv("madison_Job_Size__Per_Job_(Core_Count)__by_PI_2014-04-30_to_2015-04-30_aggregate.csv", header=7)

In [13]:
size


Out[13]:
PI Job Size: Per Job (Core Count)
0 Xia, Qian - U Wisconsin-Madison 858
1 Heinz, Sebastian - U Wisconsin-Madison 592
2 Boldyrev, Stanislav - U Wisconsin-Madison 568
3 Lazarian, Alex - U Wisconsin-Madison 474
4 Pueschel, Moritz J - U Wisconsin-Madison 282
5 Zhu, Ping - U Wisconsin-Madison 280
6 Fu, Chao - U Wisconsin-Madison 112
7 Terry, Paul W - U Wisconsin-Madison 104
8 Voyles, Paul M - U Wisconsin-Madison 90
9 Schmidt, Jordan R - U Wisconsin-Madison 53
10 Morgan, Dane - U Wisconsin-Madison 47
11 Yethiraj, Arun - U Wisconsin-Madison 28
12 Cui, Qiang - U Wisconsin-Madison 27
13 Mikulyuk, Alison F - U Wisconsin-Madison 22
14 Halzen, Francis - U Wisconsin-Madison 16
15 Miller, Ian James - U Wisconsin-Madison 16
16 Van Vleet, Mary Johanna - U Wisconsin-Madison 16

In [14]:
total = pd.merge(hours,size, on="PI")

In [15]:
type(total["PI"])


Out[15]:
pandas.core.series.Series

In [18]:
total["PI"] = total["PI"].str.split('-').str[0]

In [70]:


In [19]:
total


Out[19]:
PI CPU Hours: Total Job Size: Per Job (Core Count)
0 Morgan, Dane 3201872 47
1 Boldyrev, Stanislav 3028756 568
2 Cui, Qiang 2815000 27
3 Xia, Qian 2642461 858
4 Yethiraj, Arun 2035921 28
5 Pueschel, Moritz J 1351208 282
6 Voyles, Paul M 699044 90
7 Lazarian, Alex 494873 474
8 Schmidt, Jordan R 300689 53
9 Zhu, Ping 118841 280
10 Heinz, Sebastian 62269 592
11 Fu, Chao 54682 112
12 Van Vleet, Mary Johanna 23367 16
13 Terry, Paul W 3621 104
14 Miller, Ian James 768 16
15 Mikulyuk, Alison F 56 22
16 Halzen, Francis 53 16

In [1]:
import glob
import sys

In [5]:
date = "2014-04-30"
filenames = "*"+date+"*"
filenames


Out[5]:
'*2014-04-30*'

In [6]:
file_list = glob.glob(filenames)

In [7]:
file_list


Out[7]:
['CPU_Hours__Total__by_PI_2014-04-30_to_2015-04-30_aggregate.csv',
 'Job_Size__Per_Job_(Core_Count)__by_PI_2014-04-30_to_2015-04-30_aggregate.csv']

In [10]:
save_list = file_list[0].split('_')

In [12]:
save = "madison"
for thing in save_list[-5:]:
    save = save+"_"+thing
save


Out[12]:
'madison_PI_2014-04-30_to_2015-04-30_aggregate.csv'

In [30]:
df = pd.read_csv(file_list[0], header=7, usecols=["PI"])

In [32]:
for file in file_list:
    tmp = pd.read_csv(file, header=7)
    df = pd.merge(df,tmp, on="PI")

In [33]:
df


Out[33]:
PI CPU Hours: Total Job Size: Per Job (Core Count)
0 Morgan, Dane - U Wisconsin-Madison 3201872 47
1 Boldyrev, Stanislav - U Wisconsin-Madison 3028756 568
2 Cui, Qiang - U Wisconsin-Madison 2815000 27
3 Xia, Qian - U Wisconsin-Madison 2642461 858
4 Yethiraj, Arun - U Wisconsin-Madison 2035921 28
5 Pueschel, Moritz J - U Wisconsin-Madison 1351208 282
6 Voyles, Paul M - U Wisconsin-Madison 699044 90
7 Lazarian, Alex - U Wisconsin-Madison 494873 474
8 Schmidt, Jordan R - U Wisconsin-Madison 300689 53
9 Zhu, Ping - U Wisconsin-Madison 118841 280
10 Heinz, Sebastian - U Wisconsin-Madison 62269 592
11 Fu, Chao - U Wisconsin-Madison 54682 112
12 Van Vleet, Mary Johanna - U Wisconsin-Madison 23367 16
13 Terry, Paul W - U Wisconsin-Madison 3621 104
14 Miller, Ian James - U Wisconsin-Madison 768 16
15 Mikulyuk, Alison F - U Wisconsin-Madison 56 22
16 Halzen, Francis - U Wisconsin-Madison 53 16

In [34]:
df.to_csv("madison_2014-04-30_to_2015-04-30_aggregate.csv", index=False)

In [1]:
import pandas as pd

In [4]:
file_list = ["CPU_Hours__Total__by_PI_2015-04-01_to_2015-04-30_aggregate.csv", "Job_Size__Max_(Core_Count)__by_PI_2015-04-01_to_2015-04-30_aggregate.csv", "Job_Size__Min_(Core_Count)__by_PI_2015-04-01_to_2015-04-30_aggregate.csv", "Job_Size__Per_Job_(Core_Count)__by_PI_2015-04-01_to_2015-04-30_aggregate.csv"]

In [5]:
file_list


Out[5]:
['CPU_Hours__Total__by_PI_2015-04-01_to_2015-04-30_aggregate.csv',
 'Job_Size__Max_(Core_Count)__by_PI_2015-04-01_to_2015-04-30_aggregate.csv',
 'Job_Size__Min_(Core_Count)__by_PI_2015-04-01_to_2015-04-30_aggregate.csv',
 'Job_Size__Per_Job_(Core_Count)__by_PI_2015-04-01_to_2015-04-30_aggregate.csv']

In [12]:
df = pd.read_csv("madison_"+file_list[0], header=6, usecols=["PI"])

In [13]:
df


Out[13]:
PI
0 Pueschel, Moritz J - U Wisconsin-Madison
1 Morgan, Dane - U Wisconsin-Madison
2 Cui, Qiang - U Wisconsin-Madison
3 Yethiraj, Arun - U Wisconsin-Madison
4 Boldyrev, Stanislav - U Wisconsin-Madison
5 Heinz, Sebastian - U Wisconsin-Madison
6 Voyles, Paul M - U Wisconsin-Madison
7 Fu, Chao - U Wisconsin-Madison

In [ ]: