In [75]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import pdb
import numpy as np
import datetime
file_json = open('closed.json')
data_json = json.load(file_json)
data_df = pd.DataFrame(data_json, copy=True)
git_df_temp = data_df[['title', 'created_at', 'labels', 'closed_at', 'id']]
git_user_temp = data_df['user']
value_list = []
for i, row_entry in git_user_temp.iteritems():
    val = row_entry['login']
    value_list.append(val)
user_df = pd.Series(value_list,index=git_user_temp.index)
user_df.name = 'user'
git_df = git_df_temp.join(user_df)

In [110]:
data_df.user.head()


Out[110]:
0    {u'following_url': u'https://api.github.com/us...
1    {u'following_url': u'https://api.github.com/us...
2    {u'following_url': u'https://api.github.com/us...
3    {u'following_url': u'https://api.github.com/us...
4    {u'following_url': u'https://api.github.com/us...
Name: user, dtype: object

Question


In [90]:
git_df


Out[90]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2968 entries, 0 to 2967
Data columns (total 6 columns):
title         2968  non-null values
created_at    2968  non-null values
labels        2968  non-null values
closed_at     2968  non-null values
id            2968  non-null values
user          2968  non-null values
dtypes: int64(1), object(5)

In [40]:
git_df_2 = pd.concat([git_df_temp,user_df],axis=1)


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-40-c1fcf8b76128> in <module>()
----> 1 git_df_2 = pd.concat([git_df_temp,user_df],axis=1)

/Users/Yigong/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/tools/merge.pyc in concat(objs, axis, join, join_axes, ignore_index, keys, levels, names, verify_integrity)
    876                        ignore_index=ignore_index, join=join,
    877                        keys=keys, levels=levels, names=names,
--> 878                        verify_integrity=verify_integrity)
    879     return op.get_result()
    880 

/Users/Yigong/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/tools/merge.pyc in __init__(self, objs, axis, join, join_axes, keys, levels, names, ignore_index, verify_integrity)
    952         self.verify_integrity = verify_integrity
    953 
--> 954         self.new_axes = self._get_new_axes()
    955 
    956     def get_result(self):

/Users/Yigong/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/tools/merge.pyc in _get_new_axes(self)
   1129                 if i == self.axis:
   1130                     continue
-> 1131                 new_axes[i] = self._get_comb_axis(i)
   1132         else:
   1133             if not ((len(self.join_axes) == ndim - 1)):

/Users/Yigong/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/tools/merge.pyc in _get_comb_axis(self, i)
   1154             all_indexes = [x.index for x in self.objs]
   1155         else:
-> 1156             all_indexes = [x._data.axes[i] for x in self.objs]
   1157 
   1158         return _get_combined_index(all_indexes, intersect=self.intersect)

AttributeError: 'Series' object has no attribute '_data'

In [76]:
git_df.drop_duplicates(cols='id',inplace=True)
git_df['created_at'] = pd.to_datetime(git_df['created_at'])
git_df['closed_at'] = pd.to_datetime(git_df['closed_at'])
git_df.set_index('created_at', inplace=True, drop=False)

(5)


In [82]:
git_df.head()


Out[82]:
title created_at labels closed_at id user
created_at
2010-09-29 00:45:31 Enable element-wise comparison operations in D... 2010-09-29 00:45:31 [] 2011-02-19 23:13:48 337721 wesm
2010-09-29 00:50:13 reindex_like function 2010-09-29 00:50:13 [] 2010-12-17 02:57:33 337726 wesm
2010-09-29 00:50:52 Binary operations on int DataMatrix 2010-09-29 00:50:52 [] 2011-01-01 23:50:12 337728 wesm
2010-09-29 00:51:27 Plot keyword arguments are unused in DataFrame... 2010-09-29 00:51:27 [] 2010-12-11 06:14:32 337730 wesm
2010-09-29 00:57:00 Python 2.7 testing 2010-09-29 00:57:00 [] 2010-12-17 02:46:34 337736 wesm

In [73]:
def count(created_in):
    return len(created_in)

In [68]:
git_df.set_index('created_at', inplace = True)

In [72]:
git_df.


Out[72]:
created_at
2010-09-29T00:45:31Z    Enable element-wise comparison operations in D...
2010-09-29T00:45:31Z    Enable element-wise comparison operations in D...
2010-09-29T00:50:13Z                                reindex_like function
2010-09-29T00:50:13Z                                reindex_like function
2010-09-29T00:50:52Z                  Binary operations on int DataMatrix
Name: title, dtype: object

In [79]:
issue_df = git_df.title.resample('M', how= count)
issue_df.name = 'Number of issues'
plt.figure()
issue_ax = issue_df.plot()
issue_ax.set_ylabel(issue_df.name)


Out[79]:
<matplotlib.text.Text at 0x10eea2790>

In [78]:
%matplotlib


Using matplotlib backend: MacOSX

In [85]:
def distinct_user(resample_input):
    #print resample_input.user
    distinct_input = resample_input.drop_duplicates()
    #print distinct_input.user
    return len(distinct_input.index)
user_monthly = git_df.user.resample('M', how = distinct_user)
user_monthly.name = 'Number of distinct user'
plt.figure()
user_ax = user_monthly.plot()
user_ax.set_ylabel(user_monthly.name)


Out[85]:
<matplotlib.text.Text at 0x10eea2950>

In [86]:
import datetime

In [87]:
def mean_day(close_at_input):
    try:
        create_at_series = pd.Series(close_at_input.index, index = close_at_input.index, name = close_at_input.index.name)
        diff = (close_at_input - create_at_series)
        sec_mean = diff.sum()/(len(diff))
    #pdb.set_trace()
        return (datetime.timedelta(microseconds = int(sec_mean/1000))).days
    except:
        return np.nan
    
open_day = git_df.closed_at.resample('M', mean_day)
open_day.name = 'mean_days'
open_day_nissue = pd.concat([issue_df, open_day], axis=1)
plt.figure()
line1 = open_day_nissue['Number of issues'].plot()
line1.set_ylabel('Number of issues')
plt.figure()
line2 = open_day_nissue['mean_days'].plot()
line2.set_ylabel('mean_days')


Out[87]:
<matplotlib.text.Text at 0x113711c50>

Table for (6)


In [88]:
open_day_nissue.head()


Out[88]:
Number of issues mean_days
created_at
2010-09-30 11 138
2010-10-31 8 250
2010-11-30 2 13
2010-12-31 4 2
2011-01-31 9 52

P(7)


In [79]:
comments_df = data_df[['created_at','comments',  'id' ]]
#print comments_df.head()
comments_df.drop_duplicates(cols='id', inplace=True)

In [50]:
comments_df.comments[100][0]['text']


Out[50]:
u'duplicate issue'

In [80]:
def comment_func(comment_list):
    try:
        return comment_list[0]['text']
    except:
        return np.nan
comments_df.comments = comments_df.comments.map(comment_func)

In [81]:
comments_df.head()


Out[81]:
created_at comments id
0 2010-09-29T00:45:31Z implemented in git HEAD 337721
2 2010-09-29T00:50:13Z done 337726
4 2010-09-29T00:50:52Z I guess I &quot;accidentally&quot; fixed this ... 337728
6 2010-09-29T00:51:27Z fixed 337730
8 2010-09-29T00:57:00Z Everything seems to be working in Python 2.7 w... 337736

p(8)


In [2]:
chattiest_df = git_df[['created_at', 'user', 'id']]

In [3]:
chattiest_df.head()


Out[3]:
created_at user id
0 2010-09-29T00:45:31Z wesm 337721
1 2010-09-29T00:45:31Z wesm 337721
2 2010-09-29T00:50:13Z wesm 337726
3 2010-09-29T00:50:13Z wesm 337726
4 2010-09-29T00:50:52Z wesm 337728

In [4]:
chattiest_df['created_at'] = pd.to_datetime(chattiest_df['created_at'])
chattiest_df.set_index('created_at', inplace=True)

In [5]:
chattiest_df.head()


Out[5]:
user id
created_at
2010-09-29 00:45:31 wesm 337721
2010-09-29 00:45:31 wesm 337721
2010-09-29 00:50:13 wesm 337726
2010-09-29 00:50:13 wesm 337726
2010-09-29 00:50:52 wesm 337728

In [100]:
del chattiest_df['id']

In [102]:
chattiest_df.head()


Out[102]:
user
created_at
2010-09-29 00:45:31 wesm
2010-09-29 00:45:31 wesm
2010-09-29 00:50:13 wesm
2010-09-29 00:50:13 wesm
2010-09-29 00:50:52 wesm

In [6]:
chattiest_se = chattiest_df['user']

In [8]:
def distinct(in_df):
    return len(in_df.unique())
nDistinct_se = chattiest_se.resample('M', how = distinct)
nDistinct_se.head()


Out[8]:
created_at
2010-09-30    2
2010-10-31    3
2010-11-30    2
2010-12-31    3
2011-01-31    5
Freq: M, dtype: int64

In [112]:
chattiest_se.head()


Out[112]:
created_at
2010-09-29 00:45:31    wesm
2010-09-29 00:45:31    wesm
2010-09-29 00:50:13    wesm
2010-09-29 00:50:13    wesm
2010-09-29 00:50:52    wesm
Name: user, dtype: object

In [113]:
chattiest_se.resample('M', how = [distinct, percent, chattiest_user, count])


Out[113]:
distinct percent chattiest_user count
created_at
2010-09-30 2 0.818182 wesm 22
2010-10-31 3 0.750000 wesm 16
2010-11-30 2 0.500000 mpenning 4
2010-12-31 3 0.500000 knm 8
2011-01-31 5 0.333333 triplechess 18
2011-02-28 2 0.500000 ghost 2
2011-03-31 1 1.000000 ghost 2
2011-04-30 0 NaN NaN 0
2011-05-31 3 0.714286 wesm 7
2011-06-30 3 0.777778 wesm 9
2011-07-31 9 0.733333 wesm 30
2011-08-31 10 0.677419 wesm 31
2011-09-30 14 0.712121 wesm 66
2011-10-31 17 0.661017 wesm 118
2011-11-30 25 0.536364 wesm 110
2011-12-31 22 0.496000 wesm 125
2012-01-31 41 0.363636 wesm 154
2012-02-29 26 0.485149 wesm 101
2012-03-31 47 0.261905 wesm 126
2012-04-30 39 0.549708 wesm 171
2012-05-31 26 0.497207 wesm 179
2012-06-30 55 0.284024 wesm 169
2012-07-31 52 0.290780 wesm 141
2012-08-31 46 0.137255 wesm 102
2012-09-30 53 0.129496 changhiskhan 139
2012-10-31 47 0.141593 y-p 113
2012-11-30 63 0.245370 y-p 216
2012-12-31 66 0.154286 y-p 175
2013-01-31 55 0.130769 jreback 130
2013-02-28 56 0.192308 jreback 130
2013-03-31 56 0.363208 jreback 212
2013-04-30 49 0.274648 y-p 142

In [29]:
def percent(in_se):
    try:
        counts = in_se.value_counts().order(ascending = False)
        return counts.ix[0]/float(counts.sum())
    except:
        return np.nan
percent_se = 100*chattiest_se.resample('M', how = percent)

In [14]:
def chattiest_user(in_se):
    try:
        counts = in_se.value_counts().order(ascending = False)
        return counts.index[0]
    except:
        return np.nan
chatUser_se = chattiest_se.resample('M', how = chattiest_user)

In [15]:
chatUser_se.head()


Out[15]:
created_at
2010-09-30           wesm
2010-10-31           wesm
2010-11-30       mpenning
2010-12-31            knm
2011-01-31    triplechess
Freq: M, dtype: object

In [21]:
nComments_se = chattiest_se.resample('M', how = count)
print nComments_se.head()


created_at
2010-09-30    22
2010-10-31    16
2010-11-30     4
2010-12-31     8
2011-01-31    18
Freq: M, dtype: int64

In [30]:
user = pd.DataFrame({'nIssue':nComments_se, 'chattiest':chatUser_se,\
                     'percentage of the chattiest':percent_se, 'nParticipants':nDistinct_se},\
                    columns = ['nParticipants', 'nIssue', 'chattiest', 'percentage of the chattiest'])

In [117]:
user.columns


Out[117]:
Index([u'nParticipants', u'nIssue', u'chattiest', u'percentage of the chattiest'], dtype=object)

P(9)


In [98]:
id_labels_df = data_df[['id', 'labels', 'created_at']].drop_duplicates(cols = 'id', inplace = False)

In [99]:
id_labels_df.columns


Out[99]:
Index([u'id', u'labels', u'created_at'], dtype=object)

In [101]:
id_labels_list = []
for (idx,Id) in id_labels_df.id.iteritems():
    if len(id_labels_df.labels.ix[idx]):
        for label_dict in id_labels_df.labels.ix[idx]:
            id_labels_list.append((Id, label_dict['name'], id_labels_df.created_at[idx]))
    else:
        id_labels_list.append((Id, np.nan,id_labels_df.created_at[idx]))

In [102]:
id_labels_df = pd.DataFrame(id_labels_list, columns=['id', 'label', 'created_at'])

In [104]:
id_labels_df['created_at'] = pd.to_datetime(id_labels_df['created_at'])

In [106]:
id_labels_df.set_index('created_at', inplace=True)

In [107]:
id_labels_df.head()


Out[107]:
id label
created_at
2010-09-29 00:45:31 337721 NaN
2010-09-29 00:50:13 337726 NaN
2010-09-29 00:50:52 337728 NaN
2010-09-29 00:51:27 337730 NaN
2010-09-29 00:57:00 337736 NaN

In [108]:
issue_df


Out[108]:
created_at
2010-09-30     11
2010-10-31      8
2010-11-30      2
2010-12-31      4
2011-01-31      9
2011-02-28      2
2011-03-31      2
2011-04-30      0
2011-05-31      7
2011-06-30      9
2011-07-31     30
2011-08-31     31
2011-09-30     66
2011-10-31    118
2011-11-30    110
2011-12-31    125
2012-01-31    154
2012-02-29    101
2012-03-31    126
2012-04-30    171
2012-05-31    179
2012-06-30    169
2012-07-31    141
2012-08-31    102
2012-09-30    139
2012-10-31    113
2012-11-30    216
2012-12-31    175
2013-01-31    130
2013-02-28    130
2013-03-31    212
2013-04-30    142
Freq: M, Name: Number of issues, dtype: int64

In [56]:
git_df.labels.ix[2956]


Out[56]:
[{u'color': u'e10c02',
  u'name': u'Bug',
  u'url': u'https://api.github.com/repos/pydata/pandas/labels/Bug'},
 {u'color': u'0b02e1',
  u'name': u'Indexing',
  u'url': u'https://api.github.com/repos/pydata/pandas/labels/Indexing'},
 {u'color': u'e102d8',
  u'name': u'Dtypes',
  u'url': u'https://api.github.com/repos/pydata/pandas/labels/Dtypes'}]

In [54]:
data_df.columns


Out[54]:
Index([u'assignee', u'body', u'closed_at', u'comments', u'comments_url', u'created_at', u'events_url', u'html_url', u'id', u'labels', u'labels_url', u'milestone', u'number', u'pull_request', u'state', u'title', u'updated_at', u'url', u'user'], dtype=object)

In [81]:
user


Out[81]:
nParticipants nIssue chattiest percentage of the chattiest
created_at
2010-09-30 2 22 wesm 81.818182
2010-10-31 3 16 wesm 75.000000
2010-11-30 2 4 mpenning 50.000000
2010-12-31 3 8 knm 50.000000
2011-01-31 5 18 triplechess 33.333333
2011-02-28 2 2 ghost 50.000000
2011-03-31 1 2 ghost 100.000000
2011-04-30 0 0 NaN NaN
2011-05-31 3 7 wesm 71.428571
2011-06-30 3 9 wesm 77.777778
2011-07-31 9 30 wesm 73.333333
2011-08-31 10 31 wesm 67.741935
2011-09-30 14 66 wesm 71.212121
2011-10-31 17 118 wesm 66.101695
2011-11-30 25 110 wesm 53.636364
2011-12-31 22 125 wesm 49.600000
2012-01-31 41 154 wesm 36.363636
2012-02-29 26 101 wesm 48.514851
2012-03-31 47 126 wesm 26.190476
2012-04-30 39 171 wesm 54.970760
2012-05-31 26 179 wesm 49.720670
2012-06-30 55 169 wesm 28.402367
2012-07-31 52 141 wesm 29.078014
2012-08-31 46 102 wesm 13.725490
2012-09-30 53 139 changhiskhan 12.949640
2012-10-31 47 113 y-p 14.159292
2012-11-30 63 216 y-p 24.537037
2012-12-31 66 175 y-p 15.428571
2013-01-31 55 130 jreback 13.076923
2013-02-28 56 130 jreback 19.230769
2013-03-31 56 212 jreback 36.320755
2013-04-30 49 142 y-p 27.464789

In [111]:
data_df.columns


Out[111]:
Index([u'assignee', u'body', u'closed_at', u'comments', u'comments_url', u'created_at', u'events_url', u'html_url', u'id', u'labels', u'labels_url', u'milestone', u'number', u'pull_request', u'state', u'title', u'updated_at', u'url', u'user'], dtype=object)

In [115]:
import numpy as np
from numpy import nan as NaN

In [116]:
NaN


Out[116]:
nan

In [203]:
import pdb
import datetime
import json

import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np 
from numpy import nan as NaN

# Functions defined
def user_extract(dict_in):
	''' Extract 'login' key's value '''''
	return dict_in['login']

def count(resample_in):
	''' Count the number of samples in resampled data'''
	return len(resample_in)

def distinct_user(resample_in):
	''' Count the number of distinct user in resampled data'''
	unique_in = resample_in.drop_duplicates()
	return len(unique_in)

def mean_day(resample_in):
	''' Calculate the mean open day of an issue in resampled data'''
	try:
		#Create a Series has value equals to created_at
		created_at_series = pd.Series(
			resample_in.index, index=resample_in.index)
		diff = resample_in - created_at_series
		sec_mean = diff.sum()/float(len(diff))
		#Convert to timedelta object
		td = datetime.timedelta(microseconds=int(sec_mean/1000.))		
		#return day attribute of the time difference 
		return td.days 
	except Exception:
		return NaN

def comment_func(comment_list):
	''' Extract the comment string from the comments column'''
	try:
		return comment_list[0]['text']
	except Exception:
		return NaN

def chattiest_user(resample_se):
	''' Find the chattiest user in a month'''
	try:
		counts = resample_se.value_counts().order(ascending=True)
		return counts.index[0]
	except Exception:
		return NaN

def distinct(resample_se):
	''' Find the number of distinct user who comment on 'pandas' '''
	return len(resample_se.unique())

def percent(resample_se):
	''' Find the percentage of comments provided by the chattiest user'''
	try:
		counts = resample_se.value_counts().order(ascending=False)
		return int(counts.ix[0]/float(counts.sum())*100)
	except Exception:
		return NaN


# Read and load data
json_file = open('closed.json')
json_data = json.load(json_file)
all_data_df = pd.DataFrame(json_data)

###########
# part(1) #
###########

# p1 is the dataframe have title, created_at, labels, closed_at, user, \
# id as columns
p1 = all_data_df[['title', 'created_at', 'labels', 'closed_at',\
					'user', 'id']]
# transfer the user values to username string
p1.user = p1.user.map(user_extract)

###########
# part(2) #
###########

# Drop the duplicate rows using id inplace
p1.drop_duplicates(cols='id', inplace=True)

###########
# part(4) #
###########

# Convert created_at and closed_at columns from string to datetime
p1['created_at'] = pd.to_datetime(p1['created_at'])
p1['closed_at'] = pd.to_datetime(p1['closed_at'])

###########
# part(5) #
###########

# Set 'created_at' as index
p1.set_index('created_at', inplace=True)
# Make the monthly number of issue plot
issue_month = p1.title.resample('M', how=count)
issue_month.name = 'Number of Issues'
plt.figure()
issue_ax = issue_month.plot()
issue_ax.set_ylabel(issue_month.name)
# Make the monthly distinct user number plot
distinct_month = p1.user.resample('M', how=distinct_user)
distinct_month.name = 'Number of Distinct User'
plt.figure()
distinct_ax = distinct_month.plot()
distinct_ax.set_ylabel(distinct_month.name)

###########
# part(6) #
###########

# Resample the closed_at and return a series of mean open day
open_day = p1.closed_at.resample('M', how=mean_day)
open_day.name = 'Mean Open Day'
# Concatenate monthly issue number with mean open day
open_day = pd.concat([issue_month, open_day], axis=1)
open_day.columns = ['nIssues', 'mean_days']
plt.figure()
line1 = open_day['nIssues'].plot()
line1.set_ylabel('Number of issues')
plt.figure()
line2 = open_day['mean_days'].plot()
line2.set_ylabel('Mean Open Day')
open_day.to_pickle('mean_day.pkl')
print '_'*80
print open_day.head(20)

###########
# part(7) #
###########

# Create the comment dateframe
comm_df = all_data_df[['created_at', 'comments', 'id']]
comm_df.drop_duplicates(cols='id', inplace=True)
comm_df.comments = comm_df.comments.map(comment_func)
comm_df.created_at = pd.to_datetime(comm_df.created_at)
comm_df.set_index('created_at', inplace=True)
print '_'*80
print comm_df.head(20)
comm_df.to_pickle('comments.pkl')

###########
# part(8) #
###########

# Create user dataframe
user_df = all_data_df[['created_at', 'user', 'id']]
user_df.user = user_df.user.map(user_extract)
user_df.created_at = pd.to_datetime(user_df.created_at)
user_df.set_index('created_at', inplace=True)
user_se = user_df['user']
chattiest_df = user_se.resample('M', how=[count,chattiest_user,percent,distinct])
chattiest_df.columns = ['Number of comments', 'The chattiest', 'Percentage of the chattiest(%)',\
						'Number of participants']
print '_'*80						
print chattiest_df.head(20)
chattiest_df.to_pickle('chattiest.pkl')

###########
# part(9) #
###########

# Create id_label dataframe with create time as index
id_labels_temp = data_df[['id', 'labels', 'created_at']]
id_labels_temp.drop_duplicates(cols='id', inplace=True)
id_labels_list = []
for (idx,Id) in id_labels_temp.id.iteritems():
    if len(id_labels_temp.labels.ix[idx]):
        for label_dict in id_labels_temp.labels.ix[idx]:
            id_labels_list.append((Id, label_dict['name'], id_labels_temp.created_at[idx]))
    else:
        id_labels_list.append((Id, np.nan,id_labels_temp.created_at[idx]))
id_labels_df = pd.DataFrame(id_labels_list, columns=['id', 'label', 'created_at'])
id_labels_df['created_at'] = pd.to_datetime(id_labels_df['created_at'])
id_labels_df.set_index('created_at', inplace=True)


________________________________________________________________________________
            nIssues  mean_days
created_at                    
2010-09-30       11        138
2010-10-31        8        250
2010-11-30        2         13
2010-12-31        4          2
2011-01-31        9         52
2011-02-28        2          3
2011-03-31        2          6
2011-04-30        0        NaN
2011-05-31        7         64
2011-06-30        9         49
2011-07-31       30         67
2011-08-31       31         53
2011-09-30       66         39
2011-10-31      118         23
2011-11-30      110         21
2011-12-31      125         26
2012-01-31      154         20
2012-02-29      101         23
2012-03-31      126         30
2012-04-30      171         13
________________________________________________________________________________
                                                              comments      id
created_at                                                                    
2010-09-29 00:45:31                            implemented in git HEAD  337721
2010-09-29 00:50:13                                               done  337726
2010-09-29 00:50:52  I guess I &quot;accidentally&quot; fixed this ...  337728
2010-09-29 00:51:27                                              fixed  337730
2010-09-29 00:57:00  Everything seems to be working in Python 2.7 w...  337736
2010-09-29 05:30:56  All fixed up and wrote unit tests--hopefully d...  337994
2010-09-29 15:41:55  This is a bug. DataMatrix as input to the Data...  338909
2010-09-29 19:45:47  In principle I agree with you that fill should...  339355
2010-09-30 22:29:36  A user suggested this version to start with:\r...  341577
2010-09-30 22:33:14                            Done in latest git HEAD  341581
2010-09-30 22:34:26  This will not go in forthcoming 0.3 release bu...  341583
2010-10-03 17:20:41  You make a good point, and I think it might be...  344725
2010-10-07 23:42:34  Hi Surbas,\r\n\r\nI'm sorry this has taken me ...  352369
2010-10-11 03:19:39                      implemented in recent commits  356064
2010-10-12 16:10:48  added apply and applymap functions to Series f...  358943
2010-10-12 16:13:04     Haven't been able to reproduce this so closing  358947
2010-10-12 16:13:55  Done for DataFrame / WidePanel. Needs to be ad...  358950
2010-10-12 16:15:10                                    duplicate issue  358952
2010-10-22 17:59:31                                  fixed in git HEAD  376890
2010-11-19 14:50:11  Should be safe to use the git HEAD, I will try...  428564
________________________________________________________________________________
            Number of comments The chattiest  Percentage of the chattiest(%)  \
created_at                                                                     
2010-09-30                  22       andylei                              81   
2010-10-31                  16      hector13                              75   
2010-11-30                   4          wesm                              50   
2010-12-31                   8      mpenning                              50   
2011-01-31                  18          wesm                              33   
2011-02-28                   2       tgefell                              50   
2011-03-31                   2         ghost                             100   
2011-04-30                   0           NaN                             NaN   
2011-05-31                   7        surbas                              71   
2011-06-30                   9     dieterv77                              77   
2011-07-31                  30       talltom                              73   
2011-08-31                  31         xdong                              67   
2011-09-30                  66       scottza                              71   
2011-10-31                 118  Komnomnomnom                              66   
2011-11-30                 110    algotr8der                              53   
2011-12-31                 125         MaxBo                              49   
2012-01-31                 154    fonnesbeck                              36   
2012-02-29                 101    yarikoptic                              48   
2012-03-31                 126        brentp                              26   
2012-04-30                 171        nspies                              54   

            Number of participants  
created_at                          
2010-09-30                       2  
2010-10-31                       3  
2010-11-30                       2  
2010-12-31                       3  
2011-01-31                       5  
2011-02-28                       2  
2011-03-31                       1  
2011-04-30                       0  
2011-05-31                       3  
2011-06-30                       3  
2011-07-31                       9  
2011-08-31                      10  
2011-09-30                      14  
2011-10-31                      17  
2011-11-30                      25  
2011-12-31                      22  
2012-01-31                      41  
2012-02-29                      26  
2012-03-31                      47  
2012-04-30                      39  

In [124]:
id_labels_df.head()


Out[124]:
id label created_at
0 337721 NaN 2010-09-29T00:45:31Z
1 337726 NaN 2010-09-29T00:50:13Z
2 337728 NaN 2010-09-29T00:50:52Z
3 337730 NaN 2010-09-29T00:51:27Z
4 337736 NaN 2010-09-29T00:57:00Z

In [129]:
open_day.head()


Out[129]:
nIssues mean_days
created_at
2010-09-30 11 138
2010-10-31 8 250
2010-11-30 2 13
2010-12-31 4 2
2011-01-31 9 52

In [132]:
id_labels_df.head()


Out[132]:
id label
created_at
2010-09-29 00:45:31 337721 NaN
2010-09-29 00:50:13 337726 NaN
2010-09-29 00:50:52 337728 NaN
2010-09-29 00:51:27 337730 NaN
2010-09-29 00:57:00 337736 NaN

In [133]:
p1.closed_at.head()


Out[133]:
created_at
2010-09-29 00:45:31   2011-02-19 23:13:48
2010-09-29 00:50:13   2010-12-17 02:57:33
2010-09-29 00:50:52   2011-01-01 23:50:12
2010-09-29 00:51:27   2010-12-11 06:14:32
2010-09-29 00:57:00   2010-12-17 02:46:34
Name: closed_at, dtype: datetime64[ns]

In [139]:
id_labels_df.ix[60:100]


Out[139]:
id label
created_at
2011-07-18 15:37:46 1242420 Enhancement
2011-07-18 15:37:46 1242420 Testing
2011-07-18 15:39:18 1242434 Enhancement
2011-07-18 15:39:18 1242434 timeseries
2011-07-18 15:43:35 1242459 Enhancement
2011-07-18 15:45:19 1242473 Bug
2011-07-18 15:46:39 1242483 Refactor
2011-07-18 15:47:17 1242492 Enhancement
2011-07-18 15:48:30 1242501 Enhancement
2011-07-18 15:49:35 1242511 Enhancement
2011-07-18 15:50:10 1242517 Enhancement
2011-07-18 15:52:21 1242529 Enhancement
2011-07-18 15:54:10 1242542 Enhancement
2011-07-18 15:54:45 1242545 Testing
2011-07-18 16:02:37 1242597 Enhancement
2011-07-18 21:00:39 1245597 Bug
2011-07-21 19:32:25 1265368 Build problem
2011-07-24 20:08:48 1278525 Enhancement
2011-07-26 01:42:29 1286029 NaN
2011-07-26 15:41:00 1289636 Build problem
2011-07-27 23:02:15 1299665 Bug
2011-07-28 15:04:34 1303422 NaN
2011-07-29 15:53:40 1310939 NaN
2011-07-29 16:26:50 1311144 Enhancement
2011-07-29 16:26:50 1311144 timeseries
2011-07-29 22:19:39 1313137 Enhancement
2011-07-30 01:01:42 1313688 Enhancement
2011-08-01 20:13:57 1325504 NaN
2011-08-05 15:46:35 1351978 Docs
2011-08-06 08:02:19 1355531 NaN
2011-08-07 02:54:27 1359677 NaN
2011-08-07 19:42:03 1361586 Enhancement
2011-08-07 20:57:42 1361833 Enhancement
2011-08-07 21:01:30 1361845 Enhancement
2011-08-07 21:08:46 1361862 Enhancement
2011-08-09 02:25:36 1369747 NaN
2011-08-09 17:29:47 1373938 Enhancement
2011-08-09 18:25:40 1374309 NaN
2011-08-09 20:53:42 1375374 Bug
2011-08-09 22:51:30 1376176 Enhancement

In [135]:
p1.head()


Out[135]:
title labels closed_at user id
created_at
2010-09-29 00:45:31 Enable element-wise comparison operations in D... [] 2011-02-19 23:13:48 wesm 337721
2010-09-29 00:50:13 reindex_like function [] 2010-12-17 02:57:33 wesm 337726
2010-09-29 00:50:52 Binary operations on int DataMatrix [] 2011-01-01 23:50:12 wesm 337728
2010-09-29 00:51:27 Plot keyword arguments are unused in DataFrame... [] 2010-12-11 06:14:32 wesm 337730
2010-09-29 00:57:00 Python 2.7 testing [] 2010-12-17 02:46:34 wesm 337736

In [141]:
def open_day_per_issue(series_in):
	''' Calculate the days an issue takes to be solved '''
	created_at_series = pd.Series(series_in.index, index=series_in.index)
	diff = series_in - created_at_series
	return float(diff.days)

In [147]:
open_day_per_issue_se = p1.closed_at.apply(open_day_per_issue)


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-147-014c63418d2e> in <module>()
----> 1 open_day_per_issue_se = p1.closed_at.apply(open_day_per_issue)

/Users/Yigong/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/core/series.pyc in apply(self, func, convert_dtype, args, **kwds)
   2534             values = lib.map_infer(values, lib.Timestamp)
   2535 
-> 2536         mapped = lib.map_infer(values, f, convert=convert_dtype)
   2537         if isinstance(mapped[0], Series):
   2538             from pandas.core.frame import DataFrame

/Users/Yigong/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/lib.so in pandas.lib.map_infer (pandas/lib.c:42840)()

<ipython-input-141-838498043c83> in open_day_per_issue(series_in)
      1 def open_day_per_issue(series_in):
      2         ''' Calculate the days an issue takes to be solved '''
----> 3         created_at_series = pd.Series(series_in.index, index=series_in.index)
      4         diff = series_in - created_at_series
      5         return float(diff.days)

AttributeError: 'Timestamp' object has no attribute 'index'
> <ipython-input-141-838498043c83>(3)open_day_per_issue()
      2         ''' Calculate the days an issue takes to be solved '''
----> 3         created_at_series = pd.Series(series_in.index, index=series_in.index)
      4         diff = series_in - created_at_series

ipdb> series_in
Timestamp('2011-02-19 23:13:48', tz=None)
ipdb> d
*** Newest frame
ipdb> u
> /Users/Yigong/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/core/series.py(2536)apply()
   2535 
-> 2536         mapped = lib.map_infer(values, f, convert=convert_dtype)
   2537         if isinstance(mapped[0], Series):

ipdb> u
> <ipython-input-147-014c63418d2e>(1)<module>()
----> 1 open_day_per_issue_se = p1.closed_at.apply(open_day_per_issue)

ipdb> u
*** Oldest frame
ipdb> d
> /Users/Yigong/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/core/series.py(2536)apply()
   2535 
-> 2536         mapped = lib.map_infer(values, f, convert=convert_dtype)
   2537         if isinstance(mapped[0], Series):

ipdb> d
> <ipython-input-141-838498043c83>(3)open_day_per_issue()
      2         ''' Calculate the days an issue takes to be solved '''
----> 3         created_at_series = pd.Series(series_in.index, index=series_in.index)
      4         diff = series_in - created_at_series

ipdb> exit

In [143]:
type(p1.closed_at)


Out[143]:
pandas.core.series.TimeSeries

In [144]:
p1.closed_at.index


Out[144]:
<class 'pandas.tseries.index.DatetimeIndex'>
[2010-09-29 00:45:31, ..., 2013-04-28 15:27:23]
Length: 2934, Freq: None, Timezone: None

In [145]:
%pdb


Automatic pdb calling has been turned ON

In [149]:
user.ix[0][1].index


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-149-53ba29158ff5> in <module>()
----> 1 user.ix[0][1].index

AttributeError: 'numpy.int64' object has no attribute 'index'
> <ipython-input-149-53ba29158ff5>(1)<module>()
----> 1 user.ix[0][1].index

ipdb> exit

In [151]:
p1_reset = p1.reset_index()

In [152]:
p1_reset.head()


Out[152]:
created_at title labels closed_at user id
0 2010-09-29 00:45:31 Enable element-wise comparison operations in D... [] 2011-02-19 23:13:48 wesm 337721
1 2010-09-29 00:50:13 reindex_like function [] 2010-12-17 02:57:33 wesm 337726
2 2010-09-29 00:50:52 Binary operations on int DataMatrix [] 2011-01-01 23:50:12 wesm 337728
3 2010-09-29 00:51:27 Plot keyword arguments are unused in DataFrame... [] 2010-12-11 06:14:32 wesm 337730
4 2010-09-29 00:57:00 Python 2.7 testing [] 2010-12-17 02:46:34 wesm 337736

In [161]:
(p1_reset['created_at']-p1_reset['closed_at']).apply(datetime.timedelta.total_seconds())


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-161-b15a1a9d3ddc> in <module>()
----> 1 (p1_reset['created_at']-p1_reset['closed_at']).apply(datetime.timedelta.total_seconds())

TypeError: descriptor 'total_seconds' of 'datetime.timedelta' object needs an argument
> <ipython-input-161-b15a1a9d3ddc>(1)<module>()
----> 1 (p1_reset['created_at']-p1_reset['closed_at']).apply(datetime.timedelta.total_seconds())

ipdb> exit

In [169]:
a = -(p1_reset['created_at']-p1_reset['closed_at'])

In [164]:
a.map(datetime.timedelta.total_seconds)


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-164-65772c304ca2> in <module>()
----> 1 a.map(datetime.timedelta.total_seconds)

/Users/Yigong/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/core/series.pyc in map(self, arg, na_action)
   2495             return Series(new_values, index=self.index, name=self.name)
   2496         else:
-> 2497             mapped = map_f(values, arg)
   2498             return Series(mapped, index=self.index, name=self.name)
   2499 

/Users/Yigong/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/lib.so in pandas.lib.map_infer (pandas/lib.c:42840)()

TypeError: descriptor 'total_seconds' requires a 'datetime.timedelta' object but received a 'numpy.timedelta64'
> /Users/Yigong/Documents/Python/AY250/hw7/inference.pyx(864)pandas.lib.map_infer (pandas/lib.c:42840)()

ipdb> exit

In [167]:
a.ix[0][1]


---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-167-f5444cee41aa> in <module>()
----> 1 a.ix[0][1]

IndexError: invalid index to scalar variable.
> <ipython-input-167-f5444cee41aa>(1)<module>()
----> 1 a.ix[0][1]

ipdb> exit

In [171]:
a.ix[0]


Out[171]:
numpy.timedelta64(12436097000000000,'ns')

In [173]:
p1_reset


Out[173]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2934 entries, 0 to 2933
Data columns (total 6 columns):
created_at    2934  non-null values
title         2934  non-null values
labels        2934  non-null values
closed_at     2934  non-null values
user          2934  non-null values
id            2934  non-null values
dtypes: datetime64[ns](2), int64(1), object(3)

In [174]:
created_at = p1_reset['created_at']
closed_at = p1_reset['closed_at']

In [175]:
time_diff = closed_at - created_at

In [176]:
time_diff.head()


Out[176]:
0   143 days, 22:28:17
1    79 days, 02:07:20
2    94 days, 22:59:20
3    73 days, 05:23:05
4    79 days, 01:49:34
dtype: timedelta64[ns]

In [177]:
time_diff.map(datetime.timedelta.total_seconds)


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-177-074ab4b0a9de> in <module>()
----> 1 time_diff.map(datetime.timedelta.total_seconds)

/Users/Yigong/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/core/series.pyc in map(self, arg, na_action)
   2495             return Series(new_values, index=self.index, name=self.name)
   2496         else:
-> 2497             mapped = map_f(values, arg)
   2498             return Series(mapped, index=self.index, name=self.name)
   2499 

/Users/Yigong/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/lib.so in pandas.lib.map_infer (pandas/lib.c:42840)()

TypeError: descriptor 'total_seconds' requires a 'datetime.timedelta' object but received a 'numpy.timedelta64'
> /Users/Yigong/Documents/Python/AY250/hw7/inference.pyx(864)pandas.lib.map_infer (pandas/lib.c:42840)()

ipdb> exit

In [178]:
x = np.timedelta64(2069211000000000, 'ns')
>>> days = x.astype('timedelta64[D]')

In [179]:
days


Out[179]:
numpy.timedelta64(23,'D')

In [181]:
a.map(lambda td : td.days)


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-181-dfe5f4c2ec11> in <module>()
----> 1 a.map(lambda td : td.days)

/Users/Yigong/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/core/series.pyc in map(self, arg, na_action)
   2495             return Series(new_values, index=self.index, name=self.name)
   2496         else:
-> 2497             mapped = map_f(values, arg)
   2498             return Series(mapped, index=self.index, name=self.name)
   2499 

/Users/Yigong/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/lib.so in pandas.lib.map_infer (pandas/lib.c:42840)()

<ipython-input-181-dfe5f4c2ec11> in <lambda>(td)
----> 1 a.map(lambda td : td.days)

AttributeError: 'numpy.timedelta64' object has no attribute 'days'
> <ipython-input-181-dfe5f4c2ec11>(1)<lambda>()
----> 1 a.map(lambda td : td.days)

ipdb> eit
*** NameError: name 'eit' is not defined
ipdb> exit

In [189]:
t = a[0]

In [190]:
t


Out[190]:
numpy.timedelta64(12436097000000000,'ns')

In [191]:
24*60*60*1e9


Out[191]:
86400000000000.0

In [192]:
created = pd.Series(p1.index, index=p1.index)
closed = p1.closed_at

In [195]:
time_diff = closed - created

In [196]:
time_diff.head()


Out[196]:
created_at
2010-09-29 00:45:31   143 days, 22:28:17
2010-09-29 00:50:13    79 days, 02:07:20
2010-09-29 00:50:52    94 days, 22:59:20
2010-09-29 00:51:27    73 days, 05:23:05
2010-09-29 00:57:00    79 days, 01:49:34
dtype: timedelta64[ns]

In [204]:
created = pd.Series(p1.index, index=p1.index)
closed = p1.closed_at
time_diff = closed - created
SEC_DAY = 24*60*60*1e9
open_day_per_issue = time_diff.map(lambda td: td/SEC_DAY)
open_day_per_issue.name = 'open_day'
print open_day_per_issue.head()


created_at
2010-09-29 00:45:31    143
2010-09-29 00:50:13     79
2010-09-29 00:50:52     94
2010-09-29 00:51:27     73
2010-09-29 00:57:00     79
Name: open_day, dtype: int64

In [ ]:
open_day_per_issue

In [199]:
id_labels_df.head()


Out[199]:
id label
created_at
2010-09-29 00:45:31 337721 NaN
2010-09-29 00:50:13 337726 NaN
2010-09-29 00:50:52 337728 NaN
2010-09-29 00:51:27 337730 NaN
2010-09-29 00:57:00 337736 NaN

In [219]:
label_open_df = pd.merge(id_labels_df.reset_index(), open_day_per_issue.reset_index(), on = 'created_at')

In [236]:
label_open_df.ix[label_open_df['label']=='Bug']['open_day']


Out[236]:
7      266
17     347
43      33
46      11
58       2
65      12
75       1
80       3
98       0
100      9
107      3
108     14
113     11
121      3
128      1
...
3716    0
3719    0
3737    9
3765    4
3771    3
3778    2
3785    0
3791    0
3795    0
3798    0
3803    0
3806    0
3807    0
3811    0
3816    0
Name: open_day, Length: 956, dtype: int64

In [231]:
a = label_open_df['label'].drop_duplicates()

In [235]:
a.values


Out[235]:
array([nan, u'Bug', u'Enhancement', u'Testing', u'timeseries', u'Refactor',
       u'Build problem', u'Docs', u'groupby', u'Ideas', u'unicode',
       u'Data IO', u'prio-high', u'prio-medium', u'prio-low',
       u'Visualization', u'Community', u'missing-data', u'Stats',
       u'Indexing', u'Output-Formatting', u"Can't Repro", u'Performance',
       u'Reshaping', u'Multithreading', u'Dtypes', u'Good as first PR',
       u'API', u'Note To Selves', u'Regression', u'Usage'], dtype=object)

In [212]:
open_day_per_issue.reset_index()


Out[212]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2934 entries, 0 to 2933
Data columns (total 2 columns):
created_at    2934  non-null values
open_day      2934  non-null values
dtypes: datetime64[ns](1), int64(1)

In [269]:
label_open_df = pd.merge(id_labels_df.reset_index(), open_day_per_issue.reset_index(),\
		 on='created_at')
label_open_df.set_index('created_at', inplace=True)
bug_df = label_open_df.ix[label_open_df['label']=='Bug']
LABELS = label_open_df['label'].drop_duplicates().values[1:]
time_per_label = pd.DataFrame([])
for label_str in LABELS:
    temp = label_open_df.ix[label_open_df['label']==label_str]['open_day']
    label_month = temp.resample('M', how='mean')
    label_month.name = label_str
    time_per_label = time_per_label.join(label_month, how='outer')
print time_per_label.head()


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5 entries, 2010-09-30 00:00:00 to 2011-01-31 00:00:00
Freq: M
Data columns (total 30 columns):
Bug                  2  non-null values
Enhancement          1  non-null values
Testing              0  non-null values
timeseries           0  non-null values
Refactor             0  non-null values
Build problem        0  non-null values
Docs                 0  non-null values
groupby              0  non-null values
Ideas                0  non-null values
unicode              0  non-null values
Data IO              0  non-null values
prio-high            0  non-null values
prio-medium          0  non-null values
prio-low             0  non-null values
Visualization        0  non-null values
Community            0  non-null values
missing-data         0  non-null values
Stats                0  non-null values
Indexing             0  non-null values
Output-Formatting    0  non-null values
Can't Repro          0  non-null values
Performance          0  non-null values
Reshaping            0  non-null values
Multithreading       0  non-null values
Dtypes               0  non-null values
Good as first PR     0  non-null values
API                  0  non-null values
Note To Selves       0  non-null values
Regression           0  non-null values
Usage                0  non-null values
dtypes: float64(30), object(0)

In [273]:
time_per_label.ix[:5,:5]


Out[273]:
Bug Enhancement Testing timeseries Refactor
created_at
2010-09-30 266 NaN NaN NaN NaN
2010-10-31 347 532.5 NaN NaN NaN
2010-11-30 NaN NaN NaN NaN NaN
2010-12-31 NaN NaN NaN NaN NaN
2011-01-31 NaN NaN NaN NaN NaN

In [281]:
time_per_label[['Bug', 'Enhancement']].plot(label=['a','b'])


Out[281]:
<matplotlib.axes.AxesSubplot at 0x115f9b350>

In [277]:
f1 = plt.figure()

In [279]:
ax1 = f1.add_axes

In [245]:
label_month


Out[245]:
created_at
2010-09-30    266.000000
2010-10-31    347.000000
2010-11-30           NaN
2010-12-31           NaN
2011-01-31           NaN
2011-02-28           NaN
2011-03-31           NaN
2011-04-30           NaN
2011-05-31     33.000000
2011-06-30     11.000000
2011-07-31      4.500000
2011-08-31      7.400000
2011-09-30     16.058824
2011-10-31      0.652174
2011-11-30     12.076923
2011-12-31     24.357143
2012-01-31     11.574468
2012-02-29     17.578947
2012-03-31     15.515152
2012-04-30     13.655738
2012-05-31      5.333333
2012-06-30      5.454545
2012-07-31     10.657534
2012-08-31     19.072727
2012-09-30     16.358491
2012-10-31     18.111111
2012-11-30      5.140845
2012-12-31      8.125000
2013-01-31     25.235294
2013-02-28     25.666667
2013-03-31      4.081633
2013-04-30      1.090909
Freq: M, Name: Bug, dtype: float64

In [240]:
time_per_label = pd.DataFrame([], index=label_open_df.index)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-240-0aec6ab49ba4> in <module>()
----> 1 time_per_label = pd.DataFrame([], index=label_open_df.index)

/Users/Yigong/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/core/frame.pyc in __init__(self, data, index, columns, dtype, copy)
    432             else:
    433                 mgr = self._init_ndarray(data, index, columns, dtype=dtype,
--> 434                                          copy=copy)
    435         else:
    436             try:

/Users/Yigong/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/core/frame.pyc in _init_ndarray(self, values, index, columns, dtype, copy)
    559             columns = _ensure_index(columns)
    560 
--> 561         return create_block_manager_from_blocks([ values.T ], [ columns, index ])
    562 
    563     def _wrap_array(self, arr, axes, copy=False):

/Users/Yigong/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/core/internals.pyc in create_block_manager_from_blocks(blocks, axes)
   2232         blocks = [ getattr(b,'values',b) for b in blocks ]
   2233         tot_items = sum(b.shape[0] for b in blocks)
-> 2234         construction_error(tot_items,blocks[0].shape[1:],axes)
   2235 
   2236 def create_block_manager_from_arrays(arrays, names, axes):

/Users/Yigong/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/core/internals.pyc in construction_error(tot_items, block_shape, axes)
   2214     raise ValueError("Shape of passed values is %s, indices imply %s" % (
   2215             tuple(map(int, [tot_items] + list(block_shape))),
-> 2216             tuple(map(int, [len(ax) for ax in axes]))))
   2217 
   2218 

ValueError: Shape of passed values is (0, 0), indices imply (0, 3825)
> /Users/Yigong/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/core/internals.py(2216)construction_error()
   2215             tuple(map(int, [tot_items] + list(block_shape))),
-> 2216             tuple(map(int, [len(ax) for ax in axes]))))
   2217 

ipdb> exit

In [268]:
for label_str in LABELS:
    temp = label_open_df.ix[label_open_df['label']==label_str]['open_day']
    label_month = temp.resample('M', how='mean')
    label_month.name = label_str
    time_per_label = time_per_label.join(label_month, how='outer')
print time_per_label.head()


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5 entries, 2010-09-30 00:00:00 to 2011-01-31 00:00:00
Freq: M
Data columns (total 30 columns):
Bug                  2  non-null values
Enhancement          1  non-null values
Testing              0  non-null values
timeseries           0  non-null values
Refactor             0  non-null values
Build problem        0  non-null values
Docs                 0  non-null values
groupby              0  non-null values
Ideas                0  non-null values
unicode              0  non-null values
Data IO              0  non-null values
prio-high            0  non-null values
prio-medium          0  non-null values
prio-low             0  non-null values
Visualization        0  non-null values
Community            0  non-null values
missing-data         0  non-null values
Stats                0  non-null values
Indexing             0  non-null values
Output-Formatting    0  non-null values
Can't Repro          0  non-null values
Performance          0  non-null values
Reshaping            0  non-null values
Multithreading       0  non-null values
Dtypes               0  non-null values
Good as first PR     0  non-null values
API                  0  non-null values
Note To Selves       0  non-null values
Regression           0  non-null values
Usage                0  non-null values
dtypes: float64(30), object(0)

In [282]:
!README.md


/bin/sh: README.md: command not found

In [283]:
ls


11_pandas.ipynb                   hw_7_pandas_and_timeseries.ipynb
12_pandas-Copy0.ipynb             hw_7_pandas_and_timeseries.py
12_pandas.ipynb                   legit.ipynb
Lecture.ipynb                     mean_day.pkl
README.md                         question.ipynb
SFHousing.csv                     test.ipynb
chattiest.pkl                     test2.ipynb
closed.json                       test_1.py
comments.pkl                      test_book.ipynb
fred_fx.csv                       time_label.pkl
hw7.py

HW7

Everything is included in hw7.py. So all you need to do is typing this in the command line: python hw7.py

HW7

Everything is included in hw7.py. So all you need to do is typing this in the command line: python hw7.py

HW7

Everything is included in hw7.py. So all you need to do is typing this in the command line: python hw7.py

Tables for 6), 7), 8), 11) are mean_day.pkl, comments.pkl, chattiest.pkl and time_label.pkl, respectively.


In [ ]: