In [75]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import pdb
import numpy as np
import datetime
file_json = open('closed.json')
data_json = json.load(file_json)
data_df = pd.DataFrame(data_json, copy=True)
git_df_temp = data_df[['title', 'created_at', 'labels', 'closed_at', 'id']]
git_user_temp = data_df['user']
value_list = []
for i, row_entry in git_user_temp.iteritems():
    val = row_entry['login']
    value_list.append(val)
user_df = pd.Series(value_list,index=git_user_temp.index)
user_df.name = 'user'
git_df = git_df_temp.join(user_df)
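# Editorial aside (hedged sketch, not from the original session): the
# login-extraction loop above can be collapsed into a single map call; the
# consolidated script later in this notebook does the same thing via
# user_extract. A self-contained illustration with a made-up stand-in for
# data_df['user']:
import pandas as pd
demo_user = pd.Series([{'login': 'user_a'}, {'login': 'user_b'}])
demo_login = demo_user.map(lambda d: d['login'])   # Series of login strings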
In [110]:
data_df.user.head()
Out[110]:
In [90]:
git_df
Out[90]:
In [40]:
git_df_2 = pd.concat([git_df_temp,user_df],axis=1)
In [76]:
git_df.drop_duplicates(cols='id',inplace=True)
git_df['created_at'] = pd.to_datetime(git_df['created_at'])
git_df['closed_at'] = pd.to_datetime(git_df['closed_at'])
git_df.set_index('created_at', inplace=True, drop=False)
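# Editorial aside (hedged): drop_duplicates(cols=...) is the older pandas
# spelling; newer pandas versions use the subset= keyword instead. A minimal,
# self-contained sketch of the equivalent call on a newer version:
import pandas as pd
demo = pd.DataFrame({'id': [1, 1, 2], 'title': ['a', 'a (dup)', 'b']})
demo = demo.drop_duplicates(subset='id')   # keeps the first row per id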
In [82]:
git_df.head()
Out[82]:
In [73]:
def count(created_in):
    return len(created_in)
In [68]:
git_df.set_index('created_at', inplace = True)
In [72]:
git_df.
Out[72]:
In [79]:
issue_df = git_df.title.resample('M', how= count)
issue_df.name = 'Number of issues'
plt.figure()
issue_ax = issue_df.plot()
issue_ax.set_ylabel(issue_df.name)
Out[79]:
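# Editorial aside (hedged): resample('M', how=...) is the older resample API.
# On newer pandas the same monthly issue count is written with a method call
# after resample. Self-contained sketch on synthetic timestamps:
import pandas as pd
demo_titles = pd.Series(['issue A', 'issue B', 'issue C'],
                        index=pd.to_datetime(['2013-01-05', '2013-01-20', '2013-02-03']))
monthly = demo_titles.resample('M').size()   # number of issues opened per month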
In [78]:
%matplotlib
In [85]:
def distinct_user(resample_input):
    #print resample_input.user
    distinct_input = resample_input.drop_duplicates()
    #print distinct_input.user
    return len(distinct_input.index)
user_monthly = git_df.user.resample('M', how=distinct_user)
user_monthly.name = 'Number of distinct users'
plt.figure()
user_ax = user_monthly.plot()
user_ax.set_ylabel(user_monthly.name)
Out[85]:
In [86]:
import datetime
In [87]:
def mean_day(close_at_input):
    try:
        create_at_series = pd.Series(close_at_input.index, index=close_at_input.index, name=close_at_input.index.name)
        diff = close_at_input - create_at_series
        sec_mean = diff.sum()/(len(diff))
        #pdb.set_trace()
        return (datetime.timedelta(microseconds=int(sec_mean/1000))).days
    except:
        return np.nan
open_day = git_df.closed_at.resample('M', mean_day)
open_day.name = 'mean_days'
open_day_nissue = pd.concat([issue_df, open_day], axis=1)
plt.figure()
line1 = open_day_nissue['Number of issues'].plot()
line1.set_ylabel('Number of issues')
plt.figure()
line2 = open_day_nissue['mean_days'].plot()
line2.set_ylabel('mean_days')
Out[87]:
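# Editorial aside (hedged): the microsecond arithmetic in mean_day is only
# needed on older pandas; on newer versions a Series of Timedelta differences
# supports .mean() directly. Self-contained sketch with made-up open/close times:
import pandas as pd
created = pd.to_datetime(['2013-01-01', '2013-01-10'])
closed = pd.Series(pd.to_datetime(['2013-01-04', '2013-01-20']), index=created)
open_time = closed - pd.Series(created, index=created)
print(open_time.mean().days)   # mean days an issue stayed open (here: 6)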
In [88]:
open_day_nissue.head()
Out[88]:
In [79]:
comments_df = data_df[['created_at','comments', 'id' ]]
#print comments_df.head()
comments_df.drop_duplicates(cols='id', inplace=True)
In [50]:
comments_df.comments[100][0]['text']
Out[50]:
In [80]:
def comment_func(comment_list):
    try:
        return comment_list[0]['text']
    except:
        return np.nan
comments_df.comments = comments_df.comments.map(comment_func)
In [81]:
comments_df.head()
Out[81]:
In [2]:
chattiest_df = git_df[['created_at', 'user', 'id']]
In [3]:
chattiest_df.head()
Out[3]:
In [4]:
chattiest_df['created_at'] = pd.to_datetime(chattiest_df['created_at'])
chattiest_df.set_index('created_at', inplace=True)
In [5]:
chattiest_df.head()
Out[5]:
In [100]:
del chattiest_df['id']
In [102]:
chattiest_df.head()
Out[102]:
In [6]:
chattiest_se = chattiest_df['user']
In [8]:
def distinct(in_df):
    return len(in_df.unique())
nDistinct_se = chattiest_se.resample('M', how = distinct)
nDistinct_se.head()
Out[8]:
In [112]:
chattiest_se.head()
Out[112]:
In [113]:
chattiest_se.resample('M', how = [distinct, percent, chattiest_user, count])
Out[113]:
In [29]:
def percent(in_se):
    try:
        counts = in_se.value_counts().order(ascending = False)
        return counts.ix[0]/float(counts.sum())
    except:
        return np.nan
percent_se = 100*chattiest_se.resample('M', how = percent)
In [14]:
def chattiest_user(in_se):
    try:
        counts = in_se.value_counts().order(ascending = False)
        return counts.index[0]
    except:
        return np.nan
chatUser_se = chattiest_se.resample('M', how = chattiest_user)
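# Editorial aside (hedged): since value_counts() already sorts in descending
# order, the chattiest user can also be read off with idxmax(). Self-contained
# sketch on made-up usernames:
import pandas as pd
demo_users = pd.Series(['user_a', 'user_b', 'user_a'])
print(demo_users.value_counts().idxmax())   # -> 'user_a'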
In [15]:
chatUser_se.head()
Out[15]:
In [21]:
nComments_se = chattiest_se.resample('M', how = count)
print nComments_se.head()
In [30]:
user = pd.DataFrame({'nIssue': nComments_se, 'chattiest': chatUser_se,
                     'percentage of the chattiest': percent_se, 'nParticipants': nDistinct_se},
                    columns=['nParticipants', 'nIssue', 'chattiest', 'percentage of the chattiest'])
In [117]:
user.columns
Out[117]:
In [98]:
id_labels_df = data_df[['id', 'labels', 'created_at']].drop_duplicates(cols = 'id', inplace = False)
In [99]:
id_labels_df.columns
Out[99]:
In [101]:
id_labels_list = []
for (idx, Id) in id_labels_df.id.iteritems():
    if len(id_labels_df.labels.ix[idx]):
        for label_dict in id_labels_df.labels.ix[idx]:
            id_labels_list.append((Id, label_dict['name'], id_labels_df.created_at[idx]))
    else:
        id_labels_list.append((Id, np.nan, id_labels_df.created_at[idx]))
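# Editorial aside (hedged): on newer pandas (0.25+) the same id/label expansion
# can be done without an explicit loop by exploding the list-valued labels
# column and then pulling out each dict's 'name'. Self-contained sketch with a
# made-up labels column shaped like the GitHub payload:
import numpy as np
import pandas as pd
demo = pd.DataFrame({'id': [1, 2],
                     'labels': [[{'name': 'Bug'}, {'name': 'Docs'}], []],
                     'created_at': pd.to_datetime(['2013-01-05', '2013-01-06'])})
exploded = demo.explode('labels')   # empty label lists become NaN rows
exploded['label'] = exploded['labels'].map(lambda d: d['name'] if isinstance(d, dict) else np.nan)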
In [102]:
id_labels_df = pd.DataFrame(id_labels_list, columns=['id', 'label', 'created_at'])
In [104]:
id_labels_df['created_at'] = pd.to_datetime(id_labels_df['created_at'])
In [106]:
id_labels_df.set_index('created_at', inplace=True)
In [107]:
id_labels_df.head()
Out[107]:
In [108]:
issue_df
Out[108]:
In [56]:
git_df.labels.ix[2956]
Out[56]:
In [54]:
data_df.columns
Out[54]:
In [81]:
user
Out[81]:
In [111]:
data_df.columns
Out[111]:
In [115]:
import numpy as np
from numpy import nan as NaN
In [116]:
NaN
Out[116]:
In [203]:
import pdb
import datetime
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from numpy import nan as NaN
# Functions defined
def user_extract(dict_in):
    '''Extract the value of the 'login' key.'''
    return dict_in['login']

def count(resample_in):
    '''Count the number of samples in the resampled data.'''
    return len(resample_in)

def distinct_user(resample_in):
    '''Count the number of distinct users in the resampled data.'''
    unique_in = resample_in.drop_duplicates()
    return len(unique_in)

def mean_day(resample_in):
    '''Calculate the mean number of days issues stay open in the resampled data.'''
    try:
        # Create a Series whose values equal the created_at index
        created_at_series = pd.Series(
            resample_in.index, index=resample_in.index)
        diff = resample_in - created_at_series
        sec_mean = diff.sum()/float(len(diff))
        # Convert to a timedelta object
        td = datetime.timedelta(microseconds=int(sec_mean/1000.))
        # Return the day attribute of the time difference
        return td.days
    except Exception:
        return NaN

def comment_func(comment_list):
    '''Extract the comment string from the comments column.'''
    try:
        return comment_list[0]['text']
    except Exception:
        return NaN

def chattiest_user(resample_se):
    '''Find the chattiest user in a month.'''
    try:
        # Sort descending so the most frequent commenter comes first
        counts = resample_se.value_counts().order(ascending=False)
        return counts.index[0]
    except Exception:
        return NaN

def distinct(resample_se):
    '''Find the number of distinct users who commented on pandas issues.'''
    return len(resample_se.unique())

def percent(resample_se):
    '''Find the percentage of comments provided by the chattiest user.'''
    try:
        counts = resample_se.value_counts().order(ascending=False)
        return int(counts.ix[0]/float(counts.sum())*100)
    except Exception:
        return NaN
# Read and load data
json_file = open('closed.json')
json_data = json.load(json_file)
all_data_df = pd.DataFrame(json_data)
###########
# part(1) #
###########
# p1 is the dataframe with title, created_at, labels, closed_at, user,
# and id as columns
p1 = all_data_df[['title', 'created_at', 'labels', 'closed_at',
                  'user', 'id']]
# Convert the user dicts to username strings
p1.user = p1.user.map(user_extract)
###########
# part(2) #
###########
# Drop the duplicate rows using id inplace
p1.drop_duplicates(cols='id', inplace=True)
###########
# part(4) #
###########
# Convert created_at and closed_at columns from string to datetime
p1['created_at'] = pd.to_datetime(p1['created_at'])
p1['closed_at'] = pd.to_datetime(p1['closed_at'])
###########
# part(5) #
###########
# Set 'created_at' as index
p1.set_index('created_at', inplace=True)
# Make the monthly number of issue plot
issue_month = p1.title.resample('M', how=count)
issue_month.name = 'Number of Issues'
plt.figure()
issue_ax = issue_month.plot()
issue_ax.set_ylabel(issue_month.name)
# Make the monthly distinct user number plot
distinct_month = p1.user.resample('M', how=distinct_user)
distinct_month.name = 'Number of Distinct Users'
plt.figure()
distinct_ax = distinct_month.plot()
distinct_ax.set_ylabel(distinct_month.name)
###########
# part(6) #
###########
# Resample the closed_at and return a series of mean open day
open_day = p1.closed_at.resample('M', how=mean_day)
open_day.name = 'Mean Open Day'
# Concatenate monthly issue number with mean open day
open_day = pd.concat([issue_month, open_day], axis=1)
open_day.columns = ['nIssues', 'mean_days']
plt.figure()
line1 = open_day['nIssues'].plot()
line1.set_ylabel('Number of issues')
plt.figure()
line2 = open_day['mean_days'].plot()
line2.set_ylabel('Mean Open Day')
open_day.to_pickle('mean_day.pkl')
print '_'*80
print open_day.head(20)
###########
# part(7) #
###########
# Create the comments dataframe
comm_df = all_data_df[['created_at', 'comments', 'id']]
comm_df.drop_duplicates(cols='id', inplace=True)
comm_df.comments = comm_df.comments.map(comment_func)
comm_df.created_at = pd.to_datetime(comm_df.created_at)
comm_df.set_index('created_at', inplace=True)
print '_'*80
print comm_df.head(20)
comm_df.to_pickle('comments.pkl')
###########
# part(8) #
###########
# Create user dataframe
user_df = all_data_df[['created_at', 'user', 'id']]
user_df.user = user_df.user.map(user_extract)
user_df.created_at = pd.to_datetime(user_df.created_at)
user_df.set_index('created_at', inplace=True)
user_se = user_df['user']
chattiest_df = user_se.resample('M', how=[count, chattiest_user, percent, distinct])
chattiest_df.columns = ['Number of comments', 'The chattiest', 'Percentage of the chattiest(%)',
                        'Number of participants']
print '_'*80
print chattiest_df.head(20)
chattiest_df.to_pickle('chattiest.pkl')
###########
# part(9) #
###########
# Create the id_label dataframe with the creation time as index
id_labels_temp = all_data_df[['id', 'labels', 'created_at']]
id_labels_temp.drop_duplicates(cols='id', inplace=True)
id_labels_list = []
for (idx, Id) in id_labels_temp.id.iteritems():
    if len(id_labels_temp.labels.ix[idx]):
        # One row per (issue id, label name) pair
        for label_dict in id_labels_temp.labels.ix[idx]:
            id_labels_list.append((Id, label_dict['name'], id_labels_temp.created_at[idx]))
    else:
        # Issues without labels get a NaN label
        id_labels_list.append((Id, np.nan, id_labels_temp.created_at[idx]))
id_labels_df = pd.DataFrame(id_labels_list, columns=['id', 'label', 'created_at'])
id_labels_df['created_at'] = pd.to_datetime(id_labels_df['created_at'])
id_labels_df.set_index('created_at', inplace=True)
In [124]:
id_labels_df.head()
Out[124]:
In [129]:
open_day.head()
Out[129]:
In [132]:
id_labels_df.head()
Out[132]:
In [133]:
p1.closed_at.head()
Out[133]:
In [139]:
id_labels_df.ix[60:100]
Out[139]:
In [135]:
p1.head()
Out[135]:
In [141]:
def open_day_per_issue(series_in):
    ''' Calculate the days an issue takes to be solved '''
    created_at_series = pd.Series(series_in.index, index=series_in.index)
    diff = series_in - created_at_series
    return float(diff.days)
In [147]:
open_day_per_issue_se = p1.closed_at.apply(open_day_per_issue)
In [143]:
type(p1.closed_at)
Out[143]:
In [144]:
p1.closed_at.index
Out[144]:
In [145]:
%pdb
In [149]:
user.ix[0][1].index
In [151]:
p1_reset = p1.reset_index()
In [152]:
p1_reset.head()
Out[152]:
In [161]:
(p1_reset['created_at']-p1_reset['closed_at']).apply(datetime.timedelta.total_seconds())
In [169]:
a = -(p1_reset['created_at']-p1_reset['closed_at'])
In [164]:
a.map(datetime.timedelta.total_seconds)
In [167]:
a.ix[0][1]
In [171]:
a.ix[0]
Out[171]:
In [173]:
p1_reset
Out[173]:
In [174]:
created_at = p1_reset['created_at']
closed_at = p1_reset['closed_at']
In [175]:
time_diff = closed_at - created_at
In [176]:
time_diff.head()
Out[176]:
In [177]:
time_diff.map(datetime.timedelta.total_seconds)
In [178]:
x = np.timedelta64(2069211000000000, 'ns')
days = x.astype('timedelta64[D]')
In [179]:
days
Out[179]:
In [181]:
a.map(lambda td : td.days)
In [189]:
t = a[0]
In [190]:
t
Out[190]:
In [191]:
24*60*60*1e9
Out[191]:
In [192]:
created = pd.Series(p1.index, index=p1.index)
closed = p1.closed_at
In [195]:
time_diff = closed - created
In [196]:
time_diff.head()
Out[196]:
In [204]:
created = pd.Series(p1.index, index=p1.index)
closed = p1.closed_at
time_diff = closed - created
SEC_DAY = 24*60*60*1e9
open_day_per_issue = time_diff.map(lambda td: td/SEC_DAY)
open_day_per_issue.name = 'open_day'
print open_day_per_issue.head()
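# Editorial aside (hedged): the 24*60*60*1e9 divisor works because the
# timedelta64 differences here are stored in nanoseconds; on newer pandas the
# same open-day values are available through the .dt accessor. Self-contained
# sketch on made-up timedeltas:
import pandas as pd
demo_diff = pd.Series(pd.to_timedelta(['3 days 12:00:00', '10 days']))
print(demo_diff.dt.days)                     # whole days: 3 and 10
print(demo_diff.dt.total_seconds() / 86400)  # fractional days: 3.5 and 10.0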
In [ ]:
open_day_per_issue
In [199]:
id_labels_df.head()
Out[199]:
In [219]:
label_open_df = pd.merge(id_labels_df.reset_index(), open_day_per_issue.reset_index(), on = 'created_at')
In [236]:
label_open_df.ix[label_open_df['label']=='Bug']['open_day']
Out[236]:
In [231]:
a = label_open_df['label'].drop_duplicates()
In [235]:
a.values
Out[235]:
In [212]:
open_day_per_issue.reset_index()
Out[212]:
In [269]:
label_open_df = pd.merge(id_labels_df.reset_index(), open_day_per_issue.reset_index(),
                         on='created_at')
label_open_df.set_index('created_at', inplace=True)
bug_df = label_open_df.ix[label_open_df['label']=='Bug']
LABELS = label_open_df['label'].drop_duplicates().values[1:]
time_per_label = pd.DataFrame([])
for label_str in LABELS:
    temp = label_open_df.ix[label_open_df['label']==label_str]['open_day']
    label_month = temp.resample('M', how='mean')
    label_month.name = label_str
    time_per_label = time_per_label.join(label_month, how='outer')
print time_per_label.head()
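# Editorial aside (hedged): the per-label monthly means built up by the join
# loop above can also be produced in one call with pivot_table, grouping the
# open days by month and label. A sketch against the same label_open_df,
# assuming a newer pandas where pd.Grouper is accepted in pivot_table:
time_per_label_alt = label_open_df.reset_index().pivot_table(
    values='open_day', index=pd.Grouper(key='created_at', freq='M'),
    columns='label', aggfunc='mean')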
In [273]:
time_per_label.ix[:5,:5]
Out[273]:
In [281]:
time_per_label[['Bug', 'Enhancement']].plot(label=['a','b'])
Out[281]:
In [277]:
f1 = plt.figure()
In [279]:
ax1 = f1.add_axes
In [245]:
label_month
Out[245]:
In [240]:
time_per_label = pd.DataFrame([], index=label_open_df.index)
In [268]:
for label_str in LABELS:
    temp = label_open_df.ix[label_open_df['label']==label_str]['open_day']
    label_month = temp.resample('M', how='mean')
    label_month.name = label_str
    time_per_label = time_per_label.join(label_month, how='outer')
print time_per_label.head()
In [282]:
!README.md
In [283]:
ls
In [ ]: