In [2]:
# import graphlab
import graphlab as gl
In [3]:
# reading the csv file
data = gl.SFrame.read_csv("C:/learn/ML/ML00caseStudy/week01Intro/song_data.csv")
In [4]:
# getting the number of rows and columns (len gives the row count too)
data.num_rows(), data.num_columns() , len(data)
Out[4]:
In [5]:
# saving the dataset
data.save('songs')
In [6]:
# loading the dataset
sdata = gl.load_sframe('songs')
In [7]:
# first 5 rows
sdata.head(5)
Out[7]:
In [8]:
# likewise, the last 4 rows
sdata.tail(4)
Out[8]:
In [9]:
# checking the column data types and names
sdata.column_types(), sdata.column_names()
Out[9]:
In [10]:
# creating a new column
sdata['my_rating'] = 0
sdata
Out[10]:
In [11]:
# generating a new column, song_age, relative to my birth year
my_age = 1989  # actually my birth year
sdata['song_age']=sdata['year']-my_age
sdata
Out[11]:
In [12]:
# generating another column, 'title length' (number of words in the title)
sdata['title length'] = sdata['title'].apply(lambda x : len(x.split()))
sdata
Out[12]:
In [13]:
# adding one more column, this time with add_column
sdata.add_column(sdata.select_column('year').apply(lambda x: x - my_age),name='how_old_was_i')
sdata
Out[13]:
In [14]:
# multiple columns can be added at once
sdata[['col1','col2']] = [sdata['title length'],sdata['my_rating']]
sdata
Out[14]:
In [15]:
# adding some more
#sdata[['col3','col4']] = [[3,4]]  # this is not allowed; assigned values should be SArrays
#sdata
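One way that should work (a minimal sketch; 'col3' and 'col4' are hypothetical names) is to wrap the plain lists in SArrays of matching length first, as in the multi-column assignment above:
sdata[['col3','col4']] = [gl.SArray([3]*len(sdata)), gl.SArray([4]*len(sdata))]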
In [16]:
# deleting a column
del sdata['song_age']
In [17]:
# renaming a column
sdata.rename({'title length':'title_length'})
sdata
Out[17]:
In [18]:
# swapping two columns' positions
sdata.swap_columns('year','title_length')
sdata
Out[18]:
In [19]:
# changing a column's data type: first check the current names and types
sdata.column_names(),sdata.column_types()
Out[19]:
In [20]:
sdata['my_rating'] = sdata['my_rating'].astype(float)
sdata.column_types()
Out[20]:
In [21]:
# swapping the columns back
sdata.swap_columns('title_length','year')
Out[21]:
In [22]:
# let's count how many 'a' characters appear in the 'title', 'release' & 'artist_name' columns
sdata.add_column(sdata[['title', 'release', 'artist_name']].apply(lambda row: sum(word.count('a') for word in row.values())), 'no_of_a')
sdata.head(3)
Out[22]:
In [23]:
# counting songs per year to spot missing or invalid values
year_count = sdata.groupby('year', gl.aggregate.COUNT)
year_count.head(6)
Out[23]:
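Passing the aggregator in a dict, the form used for the album counts later in this notebook, also lets you name the output column explicitly:
year_count_named = sdata.groupby('year', {'count': gl.aggregate.COUNT()})
year_count_named.head(3)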
In [24]:
# number of unique years
print "Number of unique years:", len(year_count)
In [25]:
print "no of invalid year count "
year_count.topk('year', reverse=True, k=1)
Out[25]:
In [26]:
# year 0 is an invalid value, so it is better to convert it to None
sdata['year'] = sdata['year'].apply(lambda x : None if x==0 else x)
sdata.head(5)
Out[26]:
In [27]:
# now I have to fix the 'how_old_was_i' column as well
# I can do this one of two ways: recompute it as year minus birth year,
# or reuse apply, returning None when the year is None
sdata['how_old_was_i'] = sdata['year'].apply(lambda x : None if x is None else x-my_age)
sdata.head(5)
Out[27]:
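If you would rather drop or fill the missing values instead of carrying the Nones around, SFrame.dropna and SArray.fillna cover that (a hedged sketch, not run here):
valid_years_only = sdata.dropna('year')   # keep only rows with a non-missing year
refilled = sdata['year'].fillna(0)        # or put the 0s back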
In [28]:
# count the songs that have valid years
print len(sdata[sdata['year']>0])
In [29]:
# a comparison on an SArray yields an SArray of 0/1 flags
tmp = sdata['year'] > 0
print tmp
In [30]:
# Look at lots of descriptive statistics of title_length
print "mean: " + str(sdata['title_length'].mean())
print "std: " + str(sdata['title_length'].std())
print "var: " + str(sdata['title_length'].var())
print "min: " + str(sdata['title_length'].min())
print "max: " + str(sdata['title_length'].max())
print "sum: " + str(sdata['title_length'].sum())
print "number of non-zero entries: " + str(sdata['title_length'].nnz())
In [31]:
approx_sketch = sdata['title_length'].sketch_summary()
print approx_sketch
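The Sketch object can also answer approximate quantile queries directly; a quick sketch of that API:
print "approx median title length: " + str(approx_sketch.quantile(0.5))
print "approx 99th percentile: " + str(approx_sketch.quantile(0.99))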
In [32]:
# let's check which songs have the longest titles
top_title_length = sdata.topk('title_length')
print top_title_length
In [33]:
# what about the shortest
lowest_title_length = sdata.topk('title_length', reverse=True)
print lowest_title_length
In [34]:
# the shortest 15
lowest_title_length = sdata.topk('title_length', reverse=True, k=15)
print lowest_title_length
In [35]:
# boolean SArray marking songs released before I was born
before_i_was_born = sdata['how_old_was_i'] < 0
before_i_was_born.all(), before_i_was_born.any()
Out[35]:
In [36]:
# count the songs in each album and show the albums with the most songs
sdata.groupby(['artist_name','release'], {'no_of_songs_in_album':gl.aggregate.COUNT} ).topk('no_of_songs_in_album')
Out[36]:
In [37]:
# running this would download the 118 MB file from the web; a local copy is read below
#usage_data = gl.SFrame.read_csv("https://static.turi.com/datasets/millionsong/10000.txt", header=False, delimiter='\t', column_type_hints={'X3':int})
#usage_data.rename({'X1':'user_id', 'X2':'song_id', 'X3':'listen_count'})
In [38]:
# Read the data
usage_data = gl.SFrame.read_csv("C:/learn/ML/ML00caseStudy/week01Intro/10000.txt", header=False, delimiter="\t", column_type_hints={'X3':int})
usage_data.rename({'X1':'user_id', 'X2':'song_id', 'X3':'listen_count'})
Out[38]:
In [39]:
# saving this data frame
usage_data.save('usage_data')
# loading the data
usage_data = gl.load_sframe('usage_data')
In [40]:
# count the unique users
print len(usage_data['user_id'].unique())
In [41]:
# let's create two datasets which we can join
ds1 = sdata[((sdata['artist_name'] == 'Relient K')
             | (sdata['artist_name'] == 'Streetlight Manifesto'))
            & (sdata['how_old_was_i'] >= 14) & (sdata['how_old_was_i'] <= 18)]
ds1
Out[41]:
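The artist filter can also be written with filter_by, which keeps rows whose column value appears in a given list (a sketch under that assumption):
ds1_alt = sdata.filter_by(['Relient K', 'Streetlight Manifesto'], 'artist_name')
ds1_alt = ds1_alt[(ds1_alt['how_old_was_i'] >= 14) & (ds1_alt['how_old_was_i'] <= 18)]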
In [42]:
# Let's join ds1 with the 10000.txt dataset usage_data
dsjoin = ds1.join(usage_data, 'song_id')
dsjoin
Out[42]:
In [43]:
# total rows in the ds1 and dsjoin datasets
len(ds1), len(dsjoin)
Out[43]:
In [44]:
len(ds1['song_id'].unique()), len(dsjoin['song_id'].unique()), len(ds1['song_id']), len(dsjoin['song_id'])
Out[44]:
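join is an inner join by default, which drops any ds1 songs that never appear in usage_data; a left join (a sketch using join's how parameter) keeps every ds1 row and fills the usage columns with None:
dsjoin_left = ds1.join(usage_data, 'song_id', how='left')
len(dsjoin_left['song_id'].unique())   # should match len(ds1['song_id'].unique())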
In [45]:
# find the most popular songs from when I was between 14 and 18
most_popular = dsjoin.groupby(['song_id'], {'total_listen_count': gl.aggregate.SUM('listen_count'),
                                            'num_unique_users': gl.aggregate.COUNT('user_id')})
most_popular
Out[45]:
In [46]:
# to get the artist name we have to join back with sdata
most_popular.join(sdata, 'song_id').topk('total_listen_count',k=20)
Out[46]:
In [47]:
# let's append a row with a huge listen count and check whether it shows up in the result above
me = gl.SFrame({'user_id':['evan'],'song_id':['SOSFAVU12A6D4FDC6A'],'listen_count':[4000]})
# adding this data to usage data
usage_data = usage_data.append(me)
In [48]:
# repeating the join and groupby from above
dsjoin = ds1.join(usage_data, 'song_id')
most_popular = dsjoin.groupby(['song_id'], {'total_listen_count': gl.aggregate.SUM('listen_count'),
                                            'num_unique_users': gl.aggregate.COUNT('user_id')})
most_popular.join(sdata, 'song_id').topk('total_listen_count',k=20)
Out[48]:
In [49]:
# Randomly split data rows into two subsets
first_set, second_set = sdata.random_split(0.8, seed = 1)
first_set.num_rows(), second_set.num_rows()
Out[49]:
If you want to split on a predicate though, you'll have to do that manually.
In [50]:
songs_before = sdata[sdata['how_old_was_i'] < 0]
songs_after = sdata[sdata['how_old_was_i'] >= 0]
songs_before.num_rows(), songs_after.num_rows()
Out[50]:
In [51]:
# drawing a 40% random sample of the data
sample = sdata.sample(0.4)
sample.num_rows()
Out[51]:
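sample draws a different subset on each run; passing a seed (sample accepts one too, to the best of my knowledge, just like random_split above) makes it reproducible:
sample_fixed = sdata.sample(0.4, seed=1)
sample_fixed.num_rows()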
In [52]:
# a small SArray to demo element-wise arithmetic
arr = gl.SArray([1,2,3])
arr
Out[52]:
In [53]:
arr2 = 2*arr
arr2
Out[53]:
In [54]:
# add
arr + arr2
Out[54]:
In [55]:
# multiply
arr * arr2
Out[55]:
In [56]:
# divide
arr2 / arr
Out[56]:
In [57]:
# iterating over an SFrame row by row
for i in sdata:
    if i['title_length'] >= 45:
        print "Whoa that's long!"
In [58]:
# total character length of the title plus the artist name
sdata['title_artist_length'] = sdata[['title','artist_name']].apply(lambda row: sum([len(col) for col in row.values()]))
sdata
Out[58]:
In [59]:
# save as csv
sdata.save('sdata_new.csv', format='csv')
In [60]:
# saving in the default binary SFrame format
sdata.save('sdata_new')
In [ ]: