Exercice:

Take countries_GDP.csv and convert it to a valid csv file without header.


In [148]:
import pandas as pd
import numpy as np

#read csv as data frame
df_gdp_raw = pd.read_csv("../data/countries_GDP.csv")
#select columns and use these that have data in 'Unamed:0', which
#actually is the country code
df_gdp = df_gdp_raw[[0,1,3,4]][df_gdp_raw['Unnamed: 0'].notnull()]
#rename columns and index
df_gdp.columns=["country_code","position","country_name","gdp"]
df_gdp.index = range(df_gdp.shape[0])
#show head
print df_gdp.head()
#show types, take into account that gdp should be integer
print df_gdp.dtypes
#change gdp dtype to numeric
df_gdp.gdp = df_gdp.gdp.apply(lambda x: x.replace(",","").strip(" "))
df_gdp.gdp = pd.to_numeric(df_gdp.gdp,errors="coerce")
print df_gdp.gdp.dtype
print df_gdp.head()
print df_gdp.tail()
#save as csv, set header as false
df_gdp.to_csv("../data/countries_GDP_clean.csv",header=False,sep=";")


  country_code position    country_name           gdp
0          USA        1   United States   17,946,996 
1          CHN        2           China   10,866,444 
2          JPN        3           Japan    4,123,258 
3          DEU        4         Germany    3,355,772 
4          GBR        5  United Kingdom    2,848,755 
country_code    object
position        object
country_name    object
gdp             object
dtype: object
float64
  country_code position    country_name         gdp
0          USA        1   United States  17946996.0
1          CHN        2           China  10866444.0
2          JPN        3           Japan   4123258.0
3          DEU        4         Germany   3355772.0
4          GBR        5  United Kingdom   2848755.0
    country_code position         country_name         gdp
224          SSF      NaN   Sub-Saharan Africa   1572873.0
225          LIC      NaN           Low income    392904.0
226          LMC      NaN  Lower middle income   5820363.0
227          UMC      NaN  Upper middle income  19732884.0
228          HIC      NaN          High income  46985247.0

Exercice:

Clean countries data and save it as a valid csv without header.


In [68]:
import pandas as pd
import numpy as np

df_country_raw = pd.read_csv("../data/countries_data.csv",sep=";")
df_country_raw.head(15)
df_country_raw.to_csv("../data/countries_data_clean.csv",header=False)

Exercice:

Build a function that generates a dataframe with N user id plus a list of a random number of random news topics from news_topics.csv


In [299]:
import pandas as pd
import numpy as np


def generate_users_df(num_users, num_topics):
    #generate num_users usernames
    usernames_df = pd.Series(["user"]*num_users).str.cat(pd.Series(np.arange(num_users)).map(str))

    #read topics csv
    news_topics = pd.read_csv("../data/news_topics.csv",header=None)
    #generate a list of N int picked uniformly random from range 0 .. num_topics
    #WARNING: is really an uniform distribution??
    rand_ints = pd.Series(np.random.randint(1,num_topics+1,num_users))

    #WARNING: what happens if x>len(news_topics)
    topics_df = rand_ints.apply(lambda x: "|".join(np.random.choice(news_topics.T[0],x,replace=False)))

    return pd.concat({'username':usernames_df,'topics':topics_df},axis=1)
    
M = 5
N = 100
users_df = generate_users_df(N,M)
users_df.head(10)


Out[299]:
topics username
0 world|healthcare|comedians|directors user0
1 economists|studios|video games user1
2 automotive|food|movies user2
3 reporters user3
4 us|video games|environment|banks|basketball user4
5 books|ncaa|theatre user5
6 grammys|horses|sportscasters|immigration|children user6
7 elections|banks|immigration user7
8 energy|artists user8
9 figure skating|energy user9

Exercice:

Save the info generated with the previous function as csv so that it can be easily loaded as a Pair RDD in pyspark.


In [300]:
import csv 

M = 20
N = 1000
users_df = generate_users_df(N,M)

users_df.to_csv("../data/users_events_example/user_info_%susers_%stopics.csv" % (N,M),
                columns=["username","topics"],
                header=None, 
                index=None)
                #quoting=csv.QUOTE_MINIMAL)

Exercice:

Build a function that generates N csv files containing user's web browsing information. This function takes a max number of users M (from user0 to userM) and generates K user information logs for a randomly picked user (with repetition). The function will return this information with a timestamp. Each file represents 5 minute activity, the activity period will be K/300. The activity information is a random selection of 1 element over news topics.


In [301]:
import datetime

def generate_user_events(date_start, num_files, num_users, num_events):
    #generate usernames
    usernames_df = pd.Series(["user"]*num_users).str.cat(pd.Series(np.arange(num_users)).map(str))
    #read topics
    news_topics = pd.read_csv("../data/news_topics.csv",header=None,lineterminator="\n").T
    #create time index
    df_index = pd.date_range(date_start, 
                             periods=num_events, 
                             freq=pd.DateOffset(seconds=float(5*60)/num_events))
    #generate data
    event_data = {"user" : np.random.choice(usernames_df,num_events,replace=True),
                  "event" : np.random.choice(news_topics[0],num_events,replace=True)}
    #generate df
    return pd.DataFrame(event_data, index = df_index, columns=["user", "event"])

num_files = 10
num_users = 100
num_events = 1000
date_start = datetime.datetime.strptime('1/1/2016', '%d/%m/%Y')

for idx,i in enumerate(range(num_files)):
    print "File ",idx+1," of ", num_files, " at ",date_start
    userevent_df = generate_user_events(date_start, num_files, num_users, num_events)
    file_name = "../data/users_events_example/userevents_" + date_start.strftime("%d%m%Y%H%M%S") + ".log"
    userevent_df.to_csv(file_name, header=None)
    date_start = date_start + datetime.timedelta(0,300)


File  1  of  10  at  2016-01-01 00:00:00
File  2  of  10  at  2016-01-01 00:05:00
File  3  of  10  at  2016-01-01 00:10:00
File  4  of  10  at  2016-01-01 00:15:00
File  5  of  10  at  2016-01-01 00:20:00
File  6  of  10  at  2016-01-01 00:25:00
File  7  of  10  at  2016-01-01 00:30:00
File  8  of  10  at  2016-01-01 00:35:00
File  9  of  10  at  2016-01-01 00:40:00
File  10  of  10  at  2016-01-01 00:45:00

Exercice:

Generate a unique id for papers.lst file and save it as a csv file. Then generate a csv file with random references among papers from papers.lst.


In [66]:
import csv, re
import pandas as pd
import numpy as np

f = file("../data/papers.lst","rb")
papers = []
for idx,l in enumerate(f.readlines()):
    t = re.match("(\d+)(\s*)(.\d*)(\s*)(\w+)(\s*)(.*)",l)
    if t:
        #print "|",t.group(1),"|",t.group(3),"|",t.group(5),"|",t.group(7),"|"
        papers.append([t.group(1),t.group(3),t.group(5),t.group(7)])
            
papers_df = pd.DataFrame(papers)
papers_df.to_csv("../data/papers.csv", header = None)

N = papers_df.shape[0]
#let's assume that a paper can have 30 references at max and 5 at min
M = 30

papers_references = pd.DataFrame(np.arange(N))
papers_references[1] = papers_references[0].apply(
    lambda x: 
        ";".join(
            [str(x) for x in np.random.choice(papers_references[0],np.random.randint(5,M))]))
papers_references.columns = ["paper_id","references"]
papers_references.to_csv("../data/papers_references.csv",header=None,index=None)

Exercice:

Read "../data/country_info_worldbank.xls" and delete wrong rows and set proper column names.


In [147]:
import pandas as pd

cc_df0 = pd.read_excel("../data/country_info_worldbank.xls")
#delete unnececary rows
cc_df1 = cc_df0[cc_df0["Unnamed: 2"].notnull()]
#get columnames and set to dataframe
colnames = cc_df1.iloc[0].tolist()
colnames[0] = "Order"
cc_df1.columns = colnames
#delete void columns
cc_df2 = cc_df1.loc[:,cc_df1.iloc[1].notnull()]
#delete first row as it is colnames
cc_df3 = cc_df2.iloc[1:]
#reindex
cc_df3.index = np.arange(cc_df3.shape[0])
cc_df3[:]["Economy"] = cc_df3.Economy.str.encode('utf-8')
cc_df3.to_csv("../data/worldbank_countrycodes_clean.csv")

Exercice:

Convert lat lon to UTM


In [3]:
import pandas as pd

est_df = pd.read_csv("../data/estacions_meteo.tsv",sep="\t")
est_df.head()
est_df.columns = est_df.columns.str.lower().\
                    str.replace("\[codi\]","").\
                    str.replace("\(m\)","").str.strip()
est_df.longitud = est_df.longitud.str.replace(",",".")
est_df.latitud = est_df.latitud.str.replace(",",".")

est_df.longitud = pd.to_numeric(est_df.longitud)
est_df.latitud = pd.to_numeric(est_df.latitud)

Exercice:

Convert to Technically Correct Data: iqsize.csv


In [1]:
import pandas as pd

df = pd.read_csv("../data/iqsize.csv", na_values="n/a")

df.dtypes

#clean piq
errors = pd.to_numeric(df.piq, errors="coerce")
print df["piq"][errors.isnull()]
df["piq"] = pd.to_numeric(df["piq"].str.replace("'","."))
df.dtypes

errors = pd.to_numeric(df.height, errors="coerce")
print df["height"][errors.isnull()]
df["height"] = pd.to_numeric(df["height"].str.replace("'","."))
df.dtypes

df.sex.unique()
df.sex = df.sex.str.replace("Woman","Female")
df.sex = df.sex.str.replace("woman","Female")
df.sex = df.sex.str.replace("woman","Female")
df.sex = df.sex.str.replace("man","Male")
df.sex = df.sex.str.replace("Man","Male")
df.sex.unique()

df.to_csv("../data/iqsize_clean.csv",index=None)


  File "<ipython-input-1-f8d8a6a45710>", line 9
    print df["piq"][errors.isnull()]
           ^
SyntaxError: invalid syntax

In [29]:
df = pd.read_csv("../data/iqsize_clean.csv")
df


Out[29]:
id piq brain height weight sex
0 0 124.0 81.69 64.5 118 Female
1 1 150.0 103.84 73.3 143 Male
2 2 128.0 96.54 68.8 172 Female
3 3 134.0 95.15 65.0 147 Male
4 4 110.0 92.88 69.0 146 Male
5 5 131.0 99.13 64.5 138 Male
6 6 98.0 85.43 66.0 175 Female
7 7 84.0 90.49 66.3 134 Male
8 8 147.0 95.55 68.8 172 Female
9 9 124.0 83.39 64.5 118 Male
10 10 128.0 107.95 70.0 151 Female
11 11 124.0 92.41 69.0 155 Male
12 12 147.0 85.65 70.5 155 Female
13 13 90.0 87.89 66.0 146 Female
14 14 96.0 86.54 68.0 135 Male
15 15 120.0 85.22 68.5 127 Male
16 16 102.0 94.51 73.5 178 Female
17 17 84.0 80.80 66.3 136 Female
18 18 86.1 88.91 70.0 180 Female
19 19 84.0 90.59 76.5 186 Female
20 20 134.0 79.06 62.0 122 NaN
21 21 128.0 95.50 68.0 132 Male
22 22 102.0 -83.18 63.0 114 Female
23 23 131.0 93.55 -72.0 171 Male
24 24 84.0 79.86 68.0 140 Female
25 25 110.0 106.25 77.0 187 Female
26 26 72.0 79.35 63.0 106 Female
27 27 124.0 86.67 66.5 159 Male
28 28 132.0 85.78 62.5 127 Female
29 29 137.0 94.96 67.0 191 Male
30 30 110.0 99.79 75.5 192 Female
31 30 86.0 88.00 69.0 0 Male
32 32 81.0 83.43 66.5 143 Female
33 33 128.0 94.81 66.5 153 Male
34 34 NaN 94.94 70.5 144 Female
35 35 94.0 89.40 64.5 139 Male
36 36 74.0 93.00 NaN 148 Female
37 37 89.0 93.59 75.5 179 Female