Exercice:
Take countries_GDP.csv and convert it to a valid csv file without header.
In [148]:
import pandas as pd
import numpy as np
#read csv as data frame
df_gdp_raw = pd.read_csv("../data/countries_GDP.csv")
#select columns and use these that have data in 'Unamed:0', which
#actually is the country code
df_gdp = df_gdp_raw[[0,1,3,4]][df_gdp_raw['Unnamed: 0'].notnull()]
#rename columns and index
df_gdp.columns=["country_code","position","country_name","gdp"]
df_gdp.index = range(df_gdp.shape[0])
#show head
print df_gdp.head()
#show types, take into account that gdp should be integer
print df_gdp.dtypes
#change gdp dtype to numeric
df_gdp.gdp = df_gdp.gdp.apply(lambda x: x.replace(",","").strip(" "))
df_gdp.gdp = pd.to_numeric(df_gdp.gdp,errors="coerce")
print df_gdp.gdp.dtype
print df_gdp.head()
print df_gdp.tail()
#save as csv, set header as false
df_gdp.to_csv("../data/countries_GDP_clean.csv",header=False,sep=";")
Exercice:
Clean countries data and save it as a valid csv without header.
In [68]:
import pandas as pd
import numpy as np
df_country_raw = pd.read_csv("../data/countries_data.csv",sep=";")
df_country_raw.head(15)
df_country_raw.to_csv("../data/countries_data_clean.csv",header=False)
Exercice:
Build a function that generates a dataframe with N user id plus a list of a random number of random news topics from news_topics.csv
In [299]:
import pandas as pd
import numpy as np
def generate_users_df(num_users, num_topics):
#generate num_users usernames
usernames_df = pd.Series(["user"]*num_users).str.cat(pd.Series(np.arange(num_users)).map(str))
#read topics csv
news_topics = pd.read_csv("../data/news_topics.csv",header=None)
#generate a list of N int picked uniformly random from range 0 .. num_topics
#WARNING: is really an uniform distribution??
rand_ints = pd.Series(np.random.randint(1,num_topics+1,num_users))
#WARNING: what happens if x>len(news_topics)
topics_df = rand_ints.apply(lambda x: "|".join(np.random.choice(news_topics.T[0],x,replace=False)))
return pd.concat({'username':usernames_df,'topics':topics_df},axis=1)
M = 5
N = 100
users_df = generate_users_df(N,M)
users_df.head(10)
Out[299]:
Exercice:
Save the info generated with the previous function as csv so that it can be easily loaded as a Pair RDD in pyspark.
In [300]:
import csv
M = 20
N = 1000
users_df = generate_users_df(N,M)
users_df.to_csv("../data/users_events_example/user_info_%susers_%stopics.csv" % (N,M),
columns=["username","topics"],
header=None,
index=None)
#quoting=csv.QUOTE_MINIMAL)
Exercice:
Build a function that generates N csv files containing user's web browsing information. This function takes a max number of users M (from user0 to userM) and generates K user information logs for a randomly picked user (with repetition). The function will return this information with a timestamp. Each file represents 5 minute activity, the activity period will be K/300. The activity information is a random selection of 1 element over news topics.
In [301]:
import datetime
def generate_user_events(date_start, num_files, num_users, num_events):
#generate usernames
usernames_df = pd.Series(["user"]*num_users).str.cat(pd.Series(np.arange(num_users)).map(str))
#read topics
news_topics = pd.read_csv("../data/news_topics.csv",header=None,lineterminator="\n").T
#create time index
df_index = pd.date_range(date_start,
periods=num_events,
freq=pd.DateOffset(seconds=float(5*60)/num_events))
#generate data
event_data = {"user" : np.random.choice(usernames_df,num_events,replace=True),
"event" : np.random.choice(news_topics[0],num_events,replace=True)}
#generate df
return pd.DataFrame(event_data, index = df_index, columns=["user", "event"])
num_files = 10
num_users = 100
num_events = 1000
date_start = datetime.datetime.strptime('1/1/2016', '%d/%m/%Y')
for idx,i in enumerate(range(num_files)):
print "File ",idx+1," of ", num_files, " at ",date_start
userevent_df = generate_user_events(date_start, num_files, num_users, num_events)
file_name = "../data/users_events_example/userevents_" + date_start.strftime("%d%m%Y%H%M%S") + ".log"
userevent_df.to_csv(file_name, header=None)
date_start = date_start + datetime.timedelta(0,300)
Exercice:
Generate a unique id for papers.lst file and save it as a csv file. Then generate a csv file with random references among papers from papers.lst.
In [66]:
import csv, re
import pandas as pd
import numpy as np
f = file("../data/papers.lst","rb")
papers = []
for idx,l in enumerate(f.readlines()):
t = re.match("(\d+)(\s*)(.\d*)(\s*)(\w+)(\s*)(.*)",l)
if t:
#print "|",t.group(1),"|",t.group(3),"|",t.group(5),"|",t.group(7),"|"
papers.append([t.group(1),t.group(3),t.group(5),t.group(7)])
papers_df = pd.DataFrame(papers)
papers_df.to_csv("../data/papers.csv", header = None)
N = papers_df.shape[0]
#let's assume that a paper can have 30 references at max and 5 at min
M = 30
papers_references = pd.DataFrame(np.arange(N))
papers_references[1] = papers_references[0].apply(
lambda x:
";".join(
[str(x) for x in np.random.choice(papers_references[0],np.random.randint(5,M))]))
papers_references.columns = ["paper_id","references"]
papers_references.to_csv("../data/papers_references.csv",header=None,index=None)
Exercice:
Read "../data/country_info_worldbank.xls" and delete wrong rows and set proper column names.
In [147]:
import pandas as pd
cc_df0 = pd.read_excel("../data/country_info_worldbank.xls")
#delete unnececary rows
cc_df1 = cc_df0[cc_df0["Unnamed: 2"].notnull()]
#get columnames and set to dataframe
colnames = cc_df1.iloc[0].tolist()
colnames[0] = "Order"
cc_df1.columns = colnames
#delete void columns
cc_df2 = cc_df1.loc[:,cc_df1.iloc[1].notnull()]
#delete first row as it is colnames
cc_df3 = cc_df2.iloc[1:]
#reindex
cc_df3.index = np.arange(cc_df3.shape[0])
cc_df3[:]["Economy"] = cc_df3.Economy.str.encode('utf-8')
cc_df3.to_csv("../data/worldbank_countrycodes_clean.csv")
Exercice:
Convert lat lon to UTM
In [3]:
import pandas as pd
est_df = pd.read_csv("../data/estacions_meteo.tsv",sep="\t")
est_df.head()
est_df.columns = est_df.columns.str.lower().\
str.replace("\[codi\]","").\
str.replace("\(m\)","").str.strip()
est_df.longitud = est_df.longitud.str.replace(",",".")
est_df.latitud = est_df.latitud.str.replace(",",".")
est_df.longitud = pd.to_numeric(est_df.longitud)
est_df.latitud = pd.to_numeric(est_df.latitud)
Exercice:
Convert to Technically Correct Data: iqsize.csv
In [1]:
import pandas as pd
df = pd.read_csv("../data/iqsize.csv", na_values="n/a")
df.dtypes
#clean piq
errors = pd.to_numeric(df.piq, errors="coerce")
print df["piq"][errors.isnull()]
df["piq"] = pd.to_numeric(df["piq"].str.replace("'","."))
df.dtypes
errors = pd.to_numeric(df.height, errors="coerce")
print df["height"][errors.isnull()]
df["height"] = pd.to_numeric(df["height"].str.replace("'","."))
df.dtypes
df.sex.unique()
df.sex = df.sex.str.replace("Woman","Female")
df.sex = df.sex.str.replace("woman","Female")
df.sex = df.sex.str.replace("woman","Female")
df.sex = df.sex.str.replace("man","Male")
df.sex = df.sex.str.replace("Man","Male")
df.sex.unique()
df.to_csv("../data/iqsize_clean.csv",index=None)
In [29]:
df = pd.read_csv("../data/iqsize_clean.csv")
df
Out[29]: