In [3]:
import pandas as pd
In [17]:
df1 = pd.read_csv("Youtube Subscribed vs Not Subscribed.csv")
df1.set_index("date",inplace=True)
df2 = pd.read_csv("Youtube Net Subscribers.csv")
df2.set_index("date",inplace=True)
df2.head()
Out[17]:
views
averageViewDuration
averageViewPercentage
comments
likes
dislikes
estimatedMinutesWatched
subscribersGained
subscribersLost
netSubscribers
totalWatchTime
date
2015-12-01
2927
135
64.175947
3
28
0
6599
43
1
42
395145
2015-11-30
4872
142
71.099189
3
32
2
11550
45
0
45
691824
2015-11-29
10281
142
71.707858
4
23
1
24452
89
5
84
1459902
2015-11-28
9906
150
77.324290
21
44
0
24886
155
4
151
1485900
2015-11-27
14725
149
79.935564
15
76
3
36808
211
5
206
2194025
In [18]:
df2.head()
views = df1["views"] > 0
subscribers = df1["subscribedStatus"]=="SUBSCRIBED"
df2[views & subscribers]
/Users/Mike/anaconda/lib/python2.7/site-packages/pandas/core/frame.py:1942: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
"DataFrame index.", UserWarning)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-18-5bec1b15ba57> in <module>()
2 views = df1["views"] > 0
3 subscribers = df1["subscribedStatus"]=="SUBSCRIBED"
----> 4 df2[views & subscribers]
/Users/Mike/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
1906 if isinstance(key, (Series, np.ndarray, Index, list)):
1907 # either boolean or fancy integer index
-> 1908 return self._getitem_array(key)
1909 elif isinstance(key, DataFrame):
1910 return self._getitem_frame(key)
/Users/Mike/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in _getitem_array(self, key)
1946 # check_bool_indexer will throw exception if Series key cannot
1947 # be reindexed to match DataFrame rows
-> 1948 key = check_bool_indexer(self.index, key)
1949 indexer = key.nonzero()[0]
1950 return self.take(indexer, axis=0, convert=False)
/Users/Mike/anaconda/lib/python2.7/site-packages/pandas/core/indexing.pyc in check_bool_indexer(ax, key)
1664 result = key
1665 if isinstance(key, ABCSeries) and not key.index.equals(ax):
-> 1666 result = result.reindex(ax)
1667 mask = com.isnull(result._values)
1668 if mask.any():
/Users/Mike/anaconda/lib/python2.7/site-packages/pandas/core/series.pyc in reindex(self, index, **kwargs)
2257 @Appender(generic._shared_docs['reindex'] % _shared_doc_kwargs)
2258 def reindex(self, index=None, **kwargs):
-> 2259 return super(Series, self).reindex(index=index, **kwargs)
2260
2261 @Appender(generic._shared_docs['fillna'] % _shared_doc_kwargs)
/Users/Mike/anaconda/lib/python2.7/site-packages/pandas/core/generic.pyc in reindex(self, *args, **kwargs)
1846 # perform the reindex on the axes
1847 return self._reindex_axes(axes, level, limit, tolerance,
-> 1848 method, fill_value, copy).__finalize__(self)
1849
1850 def _reindex_axes(self, axes, level, limit, tolerance, method,
/Users/Mike/anaconda/lib/python2.7/site-packages/pandas/core/generic.pyc in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
1865 obj = obj._reindex_with_indexers(
1866 {axis: [new_index, indexer]},
-> 1867 fill_value=fill_value, copy=copy, allow_dups=False)
1868
1869 return obj
/Users/Mike/anaconda/lib/python2.7/site-packages/pandas/core/generic.pyc in _reindex_with_indexers(self, reindexers, fill_value, copy, allow_dups)
1957 fill_value=fill_value,
1958 allow_dups=allow_dups,
-> 1959 copy=copy)
1960
1961 if copy and new_data is self._data:
/Users/Mike/anaconda/lib/python2.7/site-packages/pandas/core/internals.pyc in reindex_indexer(self, new_axis, indexer, axis, fill_value, allow_dups, copy)
3390 # some axes don't allow reindexing with dups
3391 if not allow_dups:
-> 3392 self.axes[axis]._can_reindex(indexer)
3393
3394 if axis >= self.ndim:
/Users/Mike/anaconda/lib/python2.7/site-packages/pandas/core/index.pyc in _can_reindex(self, indexer)
2017 # trying to reindex on an axis with duplicates
2018 if not self.is_unique and len(indexer):
-> 2019 raise ValueError("cannot reindex from a duplicate axis")
2020
2021 def reindex(self, target, method=None, level=None, limit=None,
ValueError: cannot reindex from a duplicate axis
In [24]:
subscribers = df1["subscribedStatus"] == "SUBSCRIBED"
df1[subscribers]
df2.loc[:,"subscriberViews"] = df1[subscribers]["views"]
In [25]:
df2.head()
Out[25]:
views
averageViewDuration
averageViewPercentage
comments
likes
dislikes
estimatedMinutesWatched
subscribersGained
subscribersLost
netSubscribers
totalWatchTime
subscriberViews
date
2015-12-01
2927
135
64.175947
3
28
0
6599
43
1
42
395145
442
2015-11-30
4872
142
71.099189
3
32
2
11550
45
0
45
691824
586
2015-11-29
10281
142
71.707858
4
23
1
24452
89
5
84
1459902
907
2015-11-28
9906
150
77.324290
21
44
0
24886
155
4
151
1485900
1124
2015-11-27
14725
149
79.935564
15
76
3
36808
211
5
206
2194025
1337
In [ ]:
Content source: facemelters/data-science
Similar notebooks: