In [162]:
%matplotlib inline
from IPython.display import Image
In [229]:
#This is a comment
print("Hello World")
In [231]:
## HOW TO IMPORT PACKAGES
## Create a dataframe (spreadsheet) called spreadsheet
#Way 1 (recommended): import the package and reach its contents through the package name
import pandas
spreadsheet = pandas.DataFrame()
# Same as above, but the package is bound to the shorter alias "pd"
import pandas as pd
spreadsheet = pd.DataFrame()
#Way 2: import names directly into the notebook's namespace
from pandas import DataFrame
spreadsheet = DataFrame()
# Star import: pulls in all the names pandas exports, so it is no longer visible
# where DataFrame came from and names already defined here can be overwritten
from pandas import *
spreadsheet = DataFrame()
In [233]:
import pandas as pd
In [234]:
df = pd.DataFrame()
In [235]:
df?
To install new packages you can use pip. Open the "Anaconda Prompt" (under the Start menu) and type: pip install package_name
In [217]:
## DATA TYPES
this_is_variable1 = 3.5
this_is_variable3 = "I'm a string"
this_is_variable2 = False
## DATA STRUCTURES (list)
this_is_variable4 = [3.5,"I'm another string",4]
this_is_variable5 = [this_is_variable1,this_is_variable2,this_is_variable3,this_is_variable4]
In [218]:
print(this_is_variable5)
In [167]:
print(type(3))
print(type(3.5))
print(type("I'm a string"))
print(type(False))
The computer uses 0s and 1s to encode strings
We used to use ASCII encoding, which reads blocks of 7 bits (0/1): 2^7 = 128 possible characters. That is enough for lower- and upper-case letters and some punctuation, but not for accented or other special symbols (e.g. é, ó, í). It's the default in Python 2 (bad for text analysis).
Nowadays we use UTF-8 encoding, that can handle all symbols in any language. It's the default of python3.
But some programs use UTF-16, ASCII or ISO-8859-1, which makes everything messy. If you are reading a file and its content shows up as weird symbols, this is likely the problem. Look for an "encoding" option in the code.
In [238]:
##THE COMPUTER READS IT LINE BY LINE, UP_DOWN
## OPERATIONS ON DATA TYPES
print(3*5.0)
print(3 == 5)
b = 3
print(b == 3)
b = 5
print(b == 5)
## CONVERT BETWEEN TYPES
print(b)
print(type(b))
b = float(b)
print(b)
print(type(b))
In [239]:
3 = 4
In [240]:
this_is_a_variable6 = not_defined_variable
In [241]:
[1,2,3]
Out[241]:
In [169]:
## A list
print([0]*10)
In [170]:
## A list
this_is_a_list = [1,3,2,"b"]
print("Original: ", this_is_a_list)
In [248]:
## Add elements
this_is_a_list.append("Hoi")
print("Added Hoi: ", this_is_a_list)
In [242]:
## Get element
print("Fourth element: ", this_is_a_list[3])
In [243]:
[0,1,2,3,4]
Out[243]:
In [173]:
## Get slices
print("Second to end element: ", this_is_a_list[1:])
In [249]:
print(this_is_a_list)
print(this_is_a_list[1:3])
print(this_is_a_list[:-1])
In [174]:
## Remove 4th element
this_is_a_list.pop(3)
print("Removed fourth element: ", this_is_a_list)
In [175]:
#Search
"Hoi" in this_is_a_list
Out[175]:
In [176]:
## Sort
this_is_a_list = [1, 3, 2]
this_is_a_list.sort()
print("Sorted: ", this_is_a_list)
ipython help
In [177]:
this_is_a_list?
In [250]:
this_is_a_list.pop?
In [179]:
this_is_a_tuple = (1,3,2,"b")
print(this_is_a_tuple)
this_is_a_list = list(this_is_a_tuple)
print(this_is_a_list)
In [251]:
print(list(range(3,16)))
print(list(range(3,16,3)))
In [181]:
Image(filename='./images/setOp.png')
Out[181]:
In [182]:
this_is_a_set1 = set([1,1,1,2,3])
this_is_a_set2 = set({1,2,4})
print(this_is_a_set1)
print(this_is_a_set2)
In [183]:
## Union
print(this_is_a_set1 | this_is_a_set2)
In [184]:
## Intersection
print(this_is_a_set1 & this_is_a_set2)
In [185]:
## Difference set1 - set2: the elements of set1 that are not in set2
print(this_is_a_set1 - this_is_a_set2)
In [252]:
#Dictionary
this_is_a_dict = {"Javier": 63434234234, "Friend1": 4234423243, "Friend2": 4234423243}
print(this_is_a_dict)
In [188]:
this_is_a_dict["Friend2"]
Out[188]:
In [189]:
#Creating dictionary using two lists
list_names = ["Javier", "Friend1", "Friend2"]
list_numbers = [63434234234,4234423243,424233345]
#Put both together
this_is_a_dict = dict(zip(list_names,list_numbers))
print(this_is_a_dict)
In [190]:
print(zip(list_names,list_numbers))
In [191]:
print(list(zip(list_names,list_numbers)))
In [ ]:
In [255]:
import time

numElements = 50000

this_is_a_list = list(range(numElements))
print(this_is_a_list[:10])
this_is_a_dict = dict(zip(range(numElements),[0]*numElements))
print(([0]*numElements)[:10])

# Time numElements dictionary look-ups.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() follows the system clock and can jump.
start = time.perf_counter()
for i in range(numElements):
    this_is_a_dict.get(i)
print(time.perf_counter() - start)

# Time the same number of searches in the list; list.index() scans the list
# from the beginning for every call, which is what makes this part slow.
start = time.perf_counter()
for i in range(numElements):
    this_is_a_list.index(i)
print(time.perf_counter() - start)
In [257]:
import numpy as np
import scipy.stats
numElements = 1000
this_is_an_list = list(range(numElements))
this_is_an_array = np.array(range(numElements))
print(this_is_an_array[:10])
## Mean
print(np.mean(this_is_an_array))
print(np.std(this_is_an_array))
print(np.median(this_is_an_array))
print(scipy.stats.mode(this_is_an_array))
print(scipy.stats.skew(this_is_an_array))
What's a CSV = comma separated values file
newspaper,number_something ABC,1 ABC,3 ABC,2 ElPais,10 ElPais, 15
The values can be separated by other characters and the file is often still called a CSV (a TSV is the same thing with tabs as separators).
In [258]:
import pandas as pd
## Read excel
# "sheet_name" is the current keyword; the old spelling "sheetname" was
# deprecated and later removed from pandas, where it raises a TypeError.
# skiprows=4 skips the first four lines of the file before the header row.
excelFrame = pd.read_excel("./data/nl_data.xlsx", sheet_name=0, header=0, skiprows=4)
## Read csv
# Cells containing "-999" are read as missing values (NaN)
csvFrame = pd.read_csv("./data/nl_data.csv",sep=",",index_col=None,skiprows=4,na_values=["-999"])
## Print first 5 rows
csvFrame.head()
Out[258]:
In [195]:
## Describe
print(csvFrame.describe())
In [196]:
## Calculate mean
print(csvFrame.mean(axis=0)) #By columns
print(csvFrame.mean(axis=1)) #By rows
In [197]:
## Keep a subset
print(csvFrame["Indicator Code"])
In [198]:
print(csvFrame["Indicator Code"]=="AG.LND.AGRI.K2")
In [199]:
print(csvFrame.loc[3,:])
In [200]:
csvFrame.loc[csvFrame["Indicator Code"]=="AG.LND.AGRI.K2",["Indicator Code","1970","1971"]]
Out[200]:
In [201]:
### More advanced stuff
# pyplot is the supported plotting interface; matplotlib discourages pylab
import matplotlib.pyplot as plt

## Plot
print(csvFrame.columns)

# Year columns '1961' .. '2015'; the frame is indexed with string labels
columns_to_keep = [str(year) for year in range(1961, 2016)]

print(csvFrame.loc[3,"Indicator Name"])
print(csvFrame.loc[5,"Indicator Name"])

# Numeric version of the year labels, used as x-axis values
years = [int(year) for year in columns_to_keep]
print(years)

# NOTE(review): row 3 is scaled by 100, presumably to convert sq. km to
# hectares so both series share the y-axis unit -- confirm against the data.
plt.plot(years,csvFrame.loc[3,columns_to_keep]*100,color="blue",label="Agricultural land")
plt.plot(years,csvFrame.loc[5,columns_to_keep],color="red",label="Arable land")
plt.xlabel("Year")
plt.ylabel("Land (hectares)")
plt.legend()
plt.show()
In [202]:
csvFrame2 = csvFrame.loc[:,["Indicator Name"]+columns_to_keep[-22:-2]]
csvFrame_tranposed = csvFrame2.set_index("Indicator Name").transpose()
csvFrame_tranposed = csvFrame_tranposed.dropna(axis=1)
csvFrame_tranposed
csvFrame_transposedShort = csvFrame_tranposed[["Agricultural land (sq. km)","Arable land (hectares)","Cereal production (metric tons)"]]
csvFrame_transposedShort.columns = ["agric_land","arable_land","cereal_prod"]
In [203]:
import statsmodels.formula.api as sm
result = sm.ols(formula="cereal_prod ~ agric_land * arable_land", data=csvFrame_transposedShort).fit()
result.summary()
Out[203]:
In [204]:
result = sm.ols(formula="cereal_prod ~ agric_land + arable_land", data=csvFrame_transposedShort).fit()
result.summary()
Out[204]:
In [259]:
result = sm.ols(formula="cereal_prod ~ arable_land", data=csvFrame_transposedShort).fit()
result.summary()
Out[259]:
In [205]:
# Collect all rows first and build the frame in one go. DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0, and growing a frame row by row
# copies the whole frame on every call.
records = [
    {"Newspaper": "ABC", "Number_something": 1},
    {"Newspaper": "ABC", "Number_something": 2},
    {"Newspaper": "ABC", "Number_something": 1},
    {"Newspaper": "ElPais", "Number_something": 10},
    {"Newspaper": "ElPais", "Number_something": 25},
    {"Newspaper": "ElPais", "Number_something": 25},
]
# None of the records has a "Year", so that column is entirely missing values
data = pd.DataFrame(records, columns=["Year", "Newspaper", "Number_something"])
print(data)

# Average the numeric column per newspaper; restricting the mean to that
# column keeps the non-numeric/empty columns out of the aggregation.
meanData = data.groupby("Newspaper", as_index=False)["Number_something"].mean()
meanData.plot(x = "Newspaper", y = "Number_something", kind="bar",edgecolor="none",color=(70/255,140/255,210/255),legend=False)
plt.show()
In [206]:
## OPERATIONS ON DATA TYPES
print(3*5.0)
print(3 == 5)
b = 3
print(b == 3)
b = 5
print(b == 5)
## CONVERT BETWEEN TYPES
print(type(b))
b = float(b)
print(type(b))
In [207]:
##INDENTATION!!
## Our own functions
def mean(listOfNumbers):
    """Return the arithmetic mean of ``listOfNumbers``.

    The values are added up with ``np.sum`` and the total is divided by the
    number of elements, so the argument must support ``len()``.
    """
    # The body of a function has to be indented; this is how Python knows
    # which statements belong to the function.
    return np.sum(listOfNumbers)/len(listOfNumbers)
In [208]:
aList = [2,3,4]
print(mean(aList))
In [209]:
s = "I hate spam."

## What's s?
def f():
    # Assigning inside the function creates a new local variable named s;
    # the global s defined above is not modified.
    s = "Me too."
    return s

f()        # the returned value is discarded
print(s)   # still the global s
In [210]:
s = "I hate spam."

## What's s?
def f():
    # s here is local to the function; only the returned value leaves it
    s = "Me too."
    return s

s = f()    # the global s is rebound to the value returned by f
print(s)
In [262]:
#Count words
aDict = dict({"Bob":5, "Pep":3})
name = input("Enter Bob or Pep: ")
if name == "Bob":
aDict["Bob"] = aDict["Bob"] + 1
elif name == "Pep":
aDict["Pep"] = aDict["Pep"] + 1
else:
print("Wrong name")
print(aDict)
In [276]:
import numpy as np
list_numbers = [1,9,121,2335432432432423434877733543544533.]
print(np.sqrt(list_numbers[0]))
print(np.sqrt(list_numbers[1]))
print(np.sqrt(list_numbers[2]))
print(np.sqrt(list_numbers[3]))
#...
In [279]:
for index in [0,1,2,3]:
print(index)
In [280]:
for index in [0,1,2,3]:
print(np.sqrt(list_numbers[index]))
In [283]:
for element in list_numbers:
print(np.sqrt(element))
In [284]:
#Slice just like lists
this_is_a_string = "Hello my name is"
print(this_is_a_string[:10])
In [286]:
print("-"*10)
In [285]:
#Upper and lower case
print("Hello All".lower())
print("Hello All".upper())
In [287]:
#Strip end spaces or return characters
this_is_a_string = "Hello my name is\n" #tab = "\t
print("-"*10)
print(this_is_a_string)
print("-"*10)
print(this_is_a_string.strip())
print("-"*10)
In [290]:
#Formatting (\t = tab, \n = return)
print("{0}\t{1}-----{2}\n".format("Hello","my name","is"))
In [369]:
#Delete punctuation
def remove_punctuation(string_to_remove):
    """Return ``string_to_remove`` lower-cased and with punctuation deleted.

    Removes every character in ``string.punctuation`` (ASCII punctuation and
    symbols) and, in addition, every character whose Unicode category is
    punctuation, such as the typographic quotes ’ “ ” that an ASCII-only
    table leaves in the text.
    """
    import string
    import unicodedata

    ascii_punctuation = set(string.punctuation)
    kept_characters = [
        char for char in string_to_remove
        if char not in ascii_punctuation
        # Unicode general categories starting with "P" are punctuation
        and not unicodedata.category(char).startswith("P")
    ]
    return "".join(kept_characters).lower()
initial_string = "Hello. I'm having breakfast with my brothers. A nice one"
print(initial_string)
new_string = remove_punctuation(initial_string)
print(new_string)
In [375]:
#Remove endings
def stem_string(string_to_stem,language="english"):
    """Stem every space-separated word of ``string_to_stem``.

    Each word is passed through NLTK's ``SnowballStemmer`` for ``language``
    and the stemmed words are joined back together with single spaces.
    """
    # Imported inside the function, as in the original cell
    from nltk.stem.snowball import SnowballStemmer
    stemmer = SnowballStemmer(language)
    stemmed_words = [stemmer.stem(word) for word in string_to_stem.split(" ")]
    return " ".join(stemmed_words)
new_string = stem_string(new_string)
print(new_string)
In [371]:
#Splitting. Convert the string into a list of words that we can iterate over
splitted_text = new_string.split(" ")
print(splitted_text)
In [372]:
#Join them
joined_text = " ".join(splitted_text)
print(joined_text)
In [ ]:
#Download package
import nltk
from nltk.corpus import stopwords
nltk.download()
In [373]:
cached_stop = stopwords.words("english")
print(cached_stop)
def remove_stop_words_not_obscure(text):
    """Return ``text`` without the words found in ``cached_stop``.

    Step-by-step version: split the text on whitespace, keep the words that
    are not stop words, and join the kept words with single spaces.
    """
    #split
    words = text.split()
    #remove stop words
    new_text = []
    for word in words:
        if word not in cached_stop:
            new_text.append(word)
    #join together
    return ' '.join(new_text)
def remove_stop_words(text):
    """Return ``text`` with every word listed in ``cached_stop`` removed.

    The text is split on whitespace and the remaining words are joined with
    single spaces.
    """
    return ' '.join(word for word in text.split() if word not in cached_stop)
print()
print(joined_text)
print(remove_stop_words(new_string))
In [300]:
def updateDictionary(aDict,name):
    """Increase the count stored under ``name`` in ``aDict`` by one.

    A name that is not yet in the dictionary starts at 1. The dictionary is
    modified in place and also returned.
    """
    # get() with a default of 0 handles new and existing names in one step
    aDict[name] = aDict.get(name, 0) + 1
    return aDict
text = "That is what happens when you flee your homeland. You don’t know that you are going to become part of a flood of refugees. I later would learn that I was one of the nearly 130,000 people who fled Saigon that day and one of the estimated two million “boat people” who fled Vietnam by boat and other means over the next two decades. But I didn’t set out to come to America; I left my house when my parents said I should."
print(text)
print()
text_no_punc = remove_punctuation(text)
print(text_no_punc)
print()
text_no_stop = remove_stop_words(text_no_punc)
print(text_no_stop)
print()
aDict = dict()
list_text_no_stop = text_no_stop.split()
print(list_text_no_stop)
print()
for word in list_text_no_stop:
aDict = updateDictionary(aDict,word)
print(aDict)
print()
In [301]:
from collections import Counter
print(Counter(list_text_no_stop))
Dates are nasty. What date is this? 05/06/2015
Luckily we have Python
from dateutil.parser import parse (documentation: http://dateutil.readthedocs.org/en/latest/parser.html#dateutil.parser.parse)
dayfirst – Whether to interpret the first value in an ambiguous 3-integer date (e.g. 01/05/09) as the day (True) or month (False). If yearfirst is set to True, this distinguishes between YDM and YMD. If set to None, this value is retrieved from the current parserinfo object (which itself defaults to False).
yearfirst – Whether to interpret the first value in an ambiguous 3-integer date (e.g. 01/05/09) as the year. If True, the first number is taken to be the year, otherwise the last number is taken to be the year. If this is set to None, the value is retrieved from the current parserinfo object (which itself defaults to False).
fuzzy – Whether to allow fuzzy parsing, allowing for string like “Today is January 1, 2047 at 8:21:00AM”.
In [307]:
from dateutil.parser import parse
print(parse("05-06-2015",dayfirst=True).date())
print(parse("05-06-2015",dayfirst=False).date())
print(parse("05/06-2015").date())
print(parse("Today is January 1, 2047 at 8:21:00AM",fuzzy=True).date())
In [309]:
with open("./data/file_to_write.csv","w+") as f:
f.write("I'm line number {0}".format(0))
f.write("I'm line number {0}".format(1))
f.write("I'm line number {0}".format(2))
f.write("I'm line number {0}".format(3))
f.write("I'm line number {0}".format(4))
In [310]:
with open("./data/file_to_write.csv","w+") as f:
f.write("I'm line number {0}\n".format(0))
f.write("I'm line number {0}\n".format(1))
f.write("I'm line number {0}\n".format(2))
f.write("I'm line number {0}\n".format(3))
f.write("I'm line number {0}\n".format(4))
In [312]:
list(range(10))
Out[312]:
In [313]:
#Beware the enter
with open("./data/file_to_write.csv","w+") as f:
for i in range(10):
f.write("I'm line number {0}\n".format(i))
In [317]:
with open("./data/file_to_write.csv","r+") as f:
splitted_by_line_1 = f.readlines()
with open("./data/file_to_write.csv","r+") as f:
all_together = f.read()
splitted_by_line_2 = all_together.split("\n")
splitted_by_line_3 = []
with open("./data/file_to_write.csv","r+") as f:
for line in f:
splitted_by_line_3.append(line)
In [318]:
print(splitted_by_line_1)
print(splitted_by_line_2)
print(splitted_by_line_3)
In [ ]:
#The strip removes the return and all that
splitted_by_line_3 = []
with open("./data/file_to_write.csv","r+") as f:
for line in f:
splitted_by_line_3.append(line.strip())
print(splitted_by_line_3)
In [ ]:
# Long-hand equivalent of a "with" block: the file has to be closed by hand.
# Start from an empty list so re-running the cell does not duplicate lines.
splitted_by_line_3 = []
f = None
try:
    f = open("./data/file_to_write.csv","r+")
    for line in f:
        splitted_by_line_3.append(line.strip())
    print(splitted_by_line_3)
except OSError as error:
    # e.g. the file does not exist or cannot be opened; report it instead of
    # silently ignoring every possible error
    print("Could not read the file:", error)
finally:
    # Runs whether or not an error happened. f is still None when open()
    # itself failed, in which case there is nothing to close.
    if f is not None:
        f.close()
In [319]:
Image("http://i.imgur.com/WRuJV6r.png")
Out[319]:
In [320]:
this_is_a_list = [1,2,3,4,5]
len_list = len(this_is_a_list)
print(len_list)
this_is_a_list[len_list]
In [321]:
this_is_a_list = [1,2,3,4,5]
for element in this_is_a_list:
this_is_a_list.pop(-1)
print(element)
In [323]:
this_is_a_list = [1,2,3,4,5]
for element in this_is_a_list:
sum_all = sum_all + element
In [324]:
def function()
return 0
In [326]:
3 = 5
In [327]:
3 == "3"
Out[327]:
In [328]:
3 == int("3")
Out[328]:
In [330]:
"A" == "a"
Out[330]:
In [331]:
open("non_existing_file","r")
In [332]:
d = dict({"You": 0, "Her": 1})
d["Him"]
In [333]:
this_is_a_list = [0,1,2,3,4]
this_is_a_list + 8
In [334]:
this_is_a_list = [0,1,2,3,4]
this_is_a_list + [8]
Out[334]:
In [335]:
this_is_a_list = [0,1,2,3,4]
this_is_a_list.add(8)
In [336]:
this_is_a_list = [4,3,2,1,0]
this_is_a_list = sorted(this_is_a_list)
print(this_is_a_list)
In [337]:
this_is_a_list = [4,3,2,1,0]
this_is_a_list = this_is_a_list.sort()
print(this_is_a_list)
In [338]:
this_is_a_list = [4,3,2,1,0]
this_is_a_list.sort() #IN-PLACE SORTING!!
print(this_is_a_list)