In [2]:
# import graphlab
import graphlab as gl
In [3]:
# reading the csv file
data = gl.SFrame.read_csv("C:/learn/ML/ML00caseStudy/week01Intro/song_data.csv")
In [4]:
# getting the number of rows and columns (len gives the row count too)
data.num_rows(), data.num_columns() , len(data)
Out[4]:
In [5]:
# saving the dataset
data.save('songs')
In [6]:
# loading the dataset
sdata = gl.load_sframe('songs')
In [7]:
# first 5 rows
sdata.head(5)
Out[7]:
In [8]:
# likewise, the last 4 rows
sdata.tail(4)
Out[8]:
In [9]:
# checking the column data types and names
sdata.column_types(), sdata.column_names()
Out[9]:
In [10]:
# creating a new column
sdata['my_rating'] = 0
sdata
Out[10]:
In [11]:
# generating a new column, song_age, relative to my birth year
my_age = 1989  # actually my birth year
sdata['song_age']=sdata['year']-my_age
sdata
Out[11]:
In [12]:
# generating another column, 'title length' (number of words in the title)
sdata['title length'] = sdata['title'].apply(lambda x : len(x.split()))
sdata
Out[12]:
In [13]:
# adding one more column, this time with add_column
sdata.add_column(sdata.select_column('year').apply(lambda x: x - my_age),name='how_old_was_i')
sdata
Out[13]:
In [14]:
# multiple columns can be added at once
sdata[['col1','col2']] = [sdata['title length'],sdata['my_rating']]
sdata
Out[14]:
In [15]:
# adding some more
#sdata[['col3','col4']] = [[3,4]]  # this is not allowed; assigned values should be SArrays
#sdata
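One way that should work (a minimal sketch; 'col3' and 'col4' are hypothetical names) is to wrap the plain lists in SArrays of matching length first, as in the multi-column assignment above:
sdata[['col3','col4']] = [gl.SArray([3]*len(sdata)), gl.SArray([4]*len(sdata))]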
In [16]:
# deleting a column
del sdata['song_age']
In [17]:
# renaming a column
sdata.rename({'title length':'title_length'})
sdata
Out[17]:
In [18]:
# swapping two columns' positions
sdata.swap_columns('year','title_length')
sdata
Out[18]:
In [19]:
# changing a column's data type: first check the current names and types
sdata.column_names(),sdata.column_types()
Out[19]:
In [20]:
sdata['my_rating'] = sdata['my_rating'].astype(float)
sdata.column_types()
Out[20]:
In [21]:
# swapping the columns back
sdata.swap_columns('title_length','year')
Out[21]:
In [22]:
# let's count how many 'a' characters appear in the 'title', 'release' & 'artist_name' columns
sdata.add_column(sdata[['title', 'release', 'artist_name']].apply(lambda row: sum(word.count('a') for word in row.values())), 'no_of_a')
sdata.head(3)
Out[22]:
In [23]:
# counting songs per year to spot missing or invalid values
year_count = sdata.groupby('year', gl.aggregate.COUNT)
year_count.head(6)
Out[23]:
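Passing the aggregator in a dict, the form used for the album counts later in this notebook, also lets you name the output column explicitly:
year_count_named = sdata.groupby('year', {'count': gl.aggregate.COUNT()})
year_count_named.head(3)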
In [24]:
# number of unique years
print "Number of unique years:", len(year_count)
In [25]:
print "no of invalid year count "
year_count.topk('year', reverse=True, k=1)
Out[25]:
In [26]:
# year 0 is an invalid value, so it is better to convert it to None
sdata['year'] = sdata['year'].apply(lambda x : None if x==0 else x)
sdata.head(5)
Out[26]:
In [27]:
# now I have to fix the 'how_old_was_i' column as well
# I can do this one of two ways: recompute it as year minus birth year,
# or reuse apply, returning None when the year is None
sdata['how_old_was_i'] = sdata['year'].apply(lambda x : None if x is None else x-my_age)
sdata.head(5)
Out[27]:
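If you would rather drop or fill the missing values instead of carrying the Nones around, SFrame.dropna and SArray.fillna cover that (a hedged sketch, not run here):
valid_years_only = sdata.dropna('year')   # keep only rows with a non-missing year
refilled = sdata['year'].fillna(0)        # or put the 0s back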
In [28]:
# count the songs that have valid years
print len(sdata[sdata['year']>0])
In [29]:
# a comparison on an SArray yields an SArray of 0/1 flags
tmp = sdata['year'] > 0
print tmp
In [30]:
# Look at lots of descriptive statistics of title_length
print "mean: " + str(sdata['title_length'].mean())
print "std: " + str(sdata['title_length'].std())
print "var: " + str(sdata['title_length'].var())
print "min: " + str(sdata['title_length'].min())
print "max: " + str(sdata['title_length'].max())
print "sum: " + str(sdata['title_length'].sum())
print "number of non-zero entries: " + str(sdata['title_length'].nnz())
In [31]:
approx_sketch = sdata['title_length'].sketch_summary()
print approx_sketch
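The Sketch object can also answer approximate quantile queries directly; a quick sketch of that API:
print "approx median title length: " + str(approx_sketch.quantile(0.5))
print "approx 99th percentile: " + str(approx_sketch.quantile(0.99))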
In [32]:
# let's check which songs have the longest titles
top_title_length = sdata.topk('title_length')
print top_title_length
In [33]:
# what about the shortest
lowest_title_length = sdata.topk('title_length', reverse=True)
print lowest_title_length
In [34]:
# the shortest 15
lowest_title_length = sdata.topk('title_length', reverse=True, k=15)
print lowest_title_length
In [35]:
# boolean SArray marking songs released before I was born
before_i_was_born = sdata['how_old_was_i'] < 0
before_i_was_born.all(), before_i_was_born.any()
Out[35]:
In [36]:
# count the songs in each album and show the albums with the most songs
sdata.groupby(['artist_name','release'], {'no_of_songs_in_album':gl.aggregate.COUNT} ).topk('no_of_songs_in_album')
Out[36]:
In [37]:
# running this would download the 118 MB file from the web; a local copy is read below
#usage_data = gl.SFrame.read_csv("https://static.turi.com/datasets/millionsong/10000.txt", header=False, delimiter='\t', column_type_hints={'X3':int})
#usage_data.rename({'X1':'user_id', 'X2':'song_id', 'X3':'listen_count'})
In [38]:
# Read the data
usage_data = gl.SFrame.read_csv("C:/learn/ML/ML00caseStudy/week01Intro/10000.txt", header=False, delimiter="\t", column_type_hints={'X3':int})
usage_data.rename({'X1':'user_id', 'X2':'song_id', 'X3':'listen_count'})
Out[38]:
In [39]:
# saving this data frame
usage_data.save('usage_data')
# loading the data
usage_data = gl.load_sframe('usage_data')
In [40]:
# count the unique users
print len(usage_data['user_id'].unique())
In [41]:
# let's create two datasets which we can join
ds1 = sdata[((sdata['artist_name'] == 'Relient K')
             | (sdata['artist_name'] == 'Streetlight Manifesto'))
            & (sdata['how_old_was_i'] >= 14) & (sdata['how_old_was_i'] <= 18)]
ds1
Out[41]:
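The artist filter can also be written with filter_by, which keeps rows whose column value appears in a given list (a sketch under that assumption):
ds1_alt = sdata.filter_by(['Relient K', 'Streetlight Manifesto'], 'artist_name')
ds1_alt = ds1_alt[(ds1_alt['how_old_was_i'] >= 14) & (ds1_alt['how_old_was_i'] <= 18)]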
In [42]:
# Let's join ds1 with the 10000.txt dataset usage_data
dsjoin = ds1.join(usage_data, 'song_id')
dsjoin
Out[42]:
In [43]:
# total rows in the ds1 and dsjoin datasets
len(ds1), len(dsjoin)
Out[43]:
In [44]:
len(ds1['song_id'].unique()), len(dsjoin['song_id'].unique()), len(ds1['song_id']), len(dsjoin['song_id'])
Out[44]:
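join is an inner join by default, which drops any ds1 songs that never appear in usage_data; a left join (a sketch using join's how parameter) keeps every ds1 row and fills the usage columns with None:
dsjoin_left = ds1.join(usage_data, 'song_id', how='left')
len(dsjoin_left['song_id'].unique())   # should match len(ds1['song_id'].unique())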
In [45]:
# find the most popular songs from when I was between 14 and 18
most_popular = dsjoin.groupby(['song_id'], {'total_listen_count': gl.aggregate.SUM('listen_count'),
                                            'num_unique_users': gl.aggregate.COUNT('user_id')})
most_popular
Out[45]:
In [46]:
# to get the artist name we have to join back with sdata
most_popular.join(sdata, 'song_id').topk('total_listen_count',k=20)
Out[46]:
In [47]:
# let's append a row with a huge listen count and check whether it shows up in the result above
me = gl.SFrame({'user_id':['evan'],'song_id':['SOSFAVU12A6D4FDC6A'],'listen_count':[4000]})
# adding this data to usage data
usage_data = usage_data.append(me)
In [48]:
# repeating the join and groupby from above
dsjoin = ds1.join(usage_data, 'song_id')
most_popular = dsjoin.groupby(['song_id'], {'total_listen_count': gl.aggregate.SUM('listen_count'),
                                            'num_unique_users': gl.aggregate.COUNT('user_id')})
most_popular.join(sdata, 'song_id').topk('total_listen_count',k=20)
Out[48]:
In [49]:
# Randomly split data rows into two subsets
first_set, second_set = sdata.random_split(0.8, seed = 1)
first_set.num_rows(), second_set.num_rows()
Out[49]:
If you want to split on a predicate though, you'll have to do that manually.
In [50]:
songs_before = sdata[sdata['how_old_was_i'] < 0]
songs_after = sdata[sdata['how_old_was_i'] >= 0]
songs_before.num_rows(), songs_after.num_rows()
Out[50]:
In [51]:
# drawing a 40% random sample of the data
sample = sdata.sample(0.4)
sample.num_rows()
Out[51]:
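sample draws a different subset on each run; passing a seed (sample accepts one too, to the best of my knowledge, just like random_split above) makes it reproducible:
sample_fixed = sdata.sample(0.4, seed=1)
sample_fixed.num_rows()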
In [52]:
# a small SArray to demo element-wise arithmetic
arr = gl.SArray([1,2,3])
arr
Out[52]:
In [53]:
arr2 = 2*arr
arr2
Out[53]:
In [54]:
# add
arr + arr2
Out[54]:
In [55]:
# multiply
arr * arr2
Out[55]:
In [56]:
# divide
arr2 / arr
Out[56]:
In [57]:
# iterating over an SFrame row by row
for i in sdata:
    if i['title_length'] >= 45:
        print "Whoa that's long!"
In [58]:
# total character length of the title plus the artist name
sdata['title_artist_length'] = sdata[['title','artist_name']].apply(lambda row: sum([len(col) for col in row.values()]))
sdata
Out[58]:
In [59]:
# save as csv
sdata.save('sdata_new.csv', format='csv')
In [60]:
# saving in the default binary SFrame format
sdata.save('sdata_new')
In [ ]: