In [1]:
# import the necessary package at the very beginning
import numpy as np
import pandas as pd
In [2]:
# Print 100 * 177 / 891 followed by a percent sign.
# NOTE(review): 177 and 891 are hard-coded; presumably the missing-age count and row count - confirm.
print('{}%'.format(100 * 177 / 891))
Example:
def AddOne(x):
    """Return ``x`` increased by one."""
    return x + 1
# Lambda form of the same computation: takes x and returns x + 1.
addOneLambda = lambda x: x+1
In [3]:
def foolOne(x):
    """Return twice ``x`` minus 25 (``x`` is assumed to be numeric)."""
    return x * 2 - 25
In [4]:
## Type Your Answer Below ##
# Lambda form of the doubling-minus-25 computation: x * 2 - 25.
foolOne_lambda = lambda x: x*2-25
In [5]:
# Draw a 3x4 array of standard-normal samples to use as test input.
tlist = np.random.standard_normal(size=(3, 4))
tlist
Out[5]:
In [6]:
# Check if the lambda function yields same results as previous function
def test_foolOne(tlist, func1, func2):
if func1(tlist).all() == func2(tlist).all():
print("Same results!")
# Run the comparison of foolOne against its lambda version on the random matrix.
test_foolOne(tlist, foolOne, foolOne_lambda)
In [15]:
def foolTwo(x):
    """Return True when the string ``x`` begins with a lowercase 'g', else False."""
    return x.startswith('g')
In [16]:
## Type Your Answer Below ##
# Lambda form of the prefix check: True when x starts with 'g'.
foolTwo_lambda = lambda x: x.startswith('g')
In [34]:
# Generate random alphanumeric strings to test the two functions
# reference: https://pythontips.com/2013/07/28/generating-a-random-string/
# reference: http://www.programcreek.com/python/example/1246/string.ascii_lowercase
import random
import string
def random_string(size):
    """Return a string of ``size`` characters drawn at random from ASCII letters and digits."""
    alphabet = string.ascii_letters + string.digits
    picked = []
    for _ in range(size):
        picked.append(random.choice(alphabet))
    return ''.join(picked)
def test_foolTwo():
    """Compare foolTwo and foolTwo_lambda on one random 6-character string.

    Returns:
        bool: True when both give the same answer, False when they differ.
    """
    test_string = random_string(6)
    # Return the comparison itself so a mismatch yields False rather than an
    # implicit None, which a caller's `is False` check would never detect.
    return foolTwo_lambda(test_string) == foolTwo(test_string)
# Run ten random trials. Any falsy result (False or None) counts as a mismatch;
# an `is False` identity check would silently ignore a None return value.
for trial in range(10):
    if not test_foolTwo():
        print('Different results!')
In [59]:
## Type Your Answer Below ##
# reference: https://docs.python.org/3/tutorial/datastructures.html
# tuple is immutable. They cannot be changed once they are made.
# tuples are simpler for the Python interpreter to handle and therefore might end up being faster
# tuples might indicate that each entry has a distinct meaning and their order has some meaning (e.g., year)
# Another pragmatic reason to use tuple is when you have data which you know should not be changed (e.g., constant)
# tuples can be used as keys in dictionaries
# tuples usually contain a heterogeneous sequence of elements that are accessed via unpacking or indexing (or even by attribute in the case of namedtuples).
tuple1 = (1, 2, 3, 'a', True)
print('tuple: ', tuple1)
print('1st item of tuple: ', tuple1[0])
# Tuples are immutable, so item assignment raises TypeError. Catch and show
# the error so the demonstration does not stop a full notebook run.
try:
    tuple1[0] = 4
except TypeError as err:
    print('TypeError: ', err)
In [70]:
# Parentheses alone do not create a tuple: (1) is just the integer 1.
tuple2 = (1)
print(type(tuple2))
# An int cannot be indexed; catch and show the TypeError instead of
# letting it abort the notebook.
try:
    tuple2[0]
except TypeError as err:
    print('TypeError: ', err)
In [71]:
# A one-element tuple: the trailing comma is what makes it a tuple.
tuple3 = 1,
print(type(tuple3))
tuple3[0]
Out[71]:
In [82]:
# Question for TA: is tuple comprehension supported?
# Parentheses around a comprehension create a generator expression, not a
# tuple, so printing it only shows a generator object. Passing the generator
# to tuple() materializes the filtered characters.
tuple4 = tuple(char for char in 'abcdabcdabcd' if char not in 'ac')
print(tuple4)
In [90]:
# Question for TA: are the following two tuples the same?
# One is written without outer parentheses and one with them.
tuple5 = ((1, 2, 'a'), (True, False))
tuple4 = (1, 2, 'a'), (True, False)
print(tuple4, tuple5, sep='\n')
In [65]:
# lists' elements are usually homogeneous and are accessed by iterating over the list.
list1 = [1, 2, 3, 'a', True]
print('list1: ', list1)
print('1st item of list: ', list1[0])
list1[0] = 4  # item assignment works for list
# List comprehension keeping only exact ints. `type(...) is int` excludes
# True, which an isinstance check would accept because bool subclasses int.
list_int = [element for element in list1 if type(element) is int]
# Print the list that was just built (the name `list2` is never defined).
print("list_int", list_int)
In [77]:
## Type Your Answer Below ##
# A set is an unordered collection with no duplicate elements.
# set() can be used to eliminate duplicate entries
list1 = ['apple', 'orange', 'apple', 'pear', 'orange', 'banana']
set1 = set(list1)
print(set1)
# set can be used for membership testing
set2 = {1, 2, 'abc', True}  # True == 1, so only one of the two is kept
print('abc' in set2)  # membership testing
# Sets do not support indexing; catch and show the TypeError so the
# demonstration does not stop a full notebook run.
try:
    set1[0]
except TypeError as err:
    print('TypeError: ', err)
In [79]:
# Set comprehension: keep each distinct letter of the string that is not 'a' or 'c'.
excluded = 'ac'
set4 = {letter for letter in 'abcdabcdabcd' if letter not in excluded}
print(set4)
Sets and lists are implemented with two different data structures: hash tables and dynamic arrays. Python lists are implemented as dynamic arrays (which preserve element order); a membership test has to compare the members one by one for equality, so lookup time is O(n) in the size of the list. Python sets are implemented as hash tables, which use the object's hash to jump directly to its bucket, so a lookup takes constant time, O(1) on average, regardless of the size of the set.
In [99]:
# Calculate the time cost differences between set and list
import time
import random
def compute_search_speed_difference(scope, search_max=100000):
    """Time one membership test in a list versus a set holding the same integers.

    Args:
        scope: Number of consecutive integers (0 .. scope - 1) stored in both
            containers.
        search_max: Inclusive upper bound of the random integer that is looked
            up. Defaults to 100000, the value that used to be hard-coded.

    Returns:
        float: List look-up time minus set look-up time, in seconds.
    """
    list1 = list(range(scope))
    set1 = set(list1)
    random_n = random.randint(0, search_max)  # look for this random integer in both list and set

    # perf_counter() is a high-resolution clock meant for short intervals;
    # time.time() can be too coarse to register a single look-up.
    list_search_starttime = time.perf_counter()
    list_search = random_n in list1
    list_search_endtime = time.perf_counter()
    list_search_time = list_search_endtime - list_search_starttime

    set_search_starttime = time.perf_counter()
    set_search = random_n in set1
    set_search_endtime = time.perf_counter()
    set_search_time = set_search_endtime - set_search_starttime

    speed_difference = list_search_time - set_search_time
    return speed_difference
def test(testing_times, scope):
    """Collect ``testing_times`` list-minus-set look-up time differences for ``scope``.

    Returns:
        list: One value from compute_search_speed_difference(scope) per run.
    """
    return [compute_search_speed_difference(scope) for _ in range(testing_times)]
# print(test(1000, 100000))  # would print all 1000 individual time differences
print("On average, the look up time for a list is more than a set in:")
# Mean over 100 runs with 1000 elements in each container.
mean_difference = np.mean(test(100, 1000))
print(mean_difference)
A Pandas Series (which can contain values of different data types) is much more general and flexible than the one-dimensional NumPy array (which can only contain one data type).
While a NumPy array has an implicitly defined integer index used to access the values, a Pandas Series has an explicitly defined index (which can be of any data type) associated with the values, which gives the Series object additional capabilities.
- NumPy is a library for efficient array computations, modeled after Matlab. Arrays differ from plain Python lists in the way they are stored and handled. Array elements stay together in memory, so they can be quickly accessed. NumPy also supports quick subindexing (a[0,:,2]). Furthermore, NumPy provides vectorized mathematical functions (when you call numpy.sin(a), the sine function is applied to every element of array a), which are faster than a Python for loop.
- The Pandas library is good for analyzing tabular data for exploratory data analysis, statistics and visualization. It's used to understand the data you have.
- SciPy provides a large menu of libraries for scientific computation, such as integration, interpolation, signal processing, linear algebra and statistics. It's built upon the infrastructure of NumPy. It's good for performing scientific and engineering calculations.
- Scikit-learn is a collection of advanced machine-learning algorithms for Python. It is built upon NumPy and SciPy. It's good for using the data you have to train a machine-learning algorithm.
In [100]:
## Type Your Answer Below ##
# Build an array from mixed ints and strings; printing it shows that every
# value has been converted to a string.
student_fields = [0, 'Alex', 3, 'M']
student = np.array(student_fields)
print(student)
You can download the data from the following link:
https://www.kaggle.com/c/titanic/data
In [123]:
## Type Your Answer Below ##
import pandas as pd
# Read the training CSV straight from the GitHub mirror, then preview three random rows.
titanic_csv_url = 'https://raw.githubusercontent.com/pcsanwald/kaggle-titanic/master/train.csv'
df = pd.read_csv(titanic_csv_url)
df.sample(3)
Out[123]:
In [124]:
# Show the last three rows of the frame.
df.tail(3)
Out[124]:
In [112]:
# Summary statistics for the frame's columns.
df.describe()
Out[112]:
In [108]:
# Column names, dtypes and non-null counts.
df.info()
In [135]:
## Type Your Answer Below ##
# Percentage of rows whose age is missing.
missing_age_rows = int(df.age.isnull().sum())
missing_age_rows / len(df) * 100
Out[135]:
In [136]:
## Type Your Answer Below ##
# Count how many rows carry each distinct value of the embarked column.
df.embarked.value_counts()
Out[136]:
In [147]:
# Compute the distinct embarked values once, then report how many there are and what they are.
embarked_classes = df.embarked.value_counts().index
print('number of classes: ', len(embarked_classes))
print('names of classes: ', embarked_classes)
In [151]:
# Another method: list the distinct embarked values directly.
# NOTE(review): embarked_set is built but not used again in the visible cells.
embarked_set = set(df.embarked)
# NOTE(review): unique() presumably also reports NaN when the column has missing values - confirm.
print(df.embarked.unique())
In [164]:
## Type Your Answer Below ##
# Combine both conditions into a single mask. Chaining df[mask1][mask2]
# applies a full-length mask to an already filtered frame, which pandas can
# only resolve by reindexing the mask (and warns about).
male_survived = df[(df.survived == 1) & (df.sex == 'male')]
# Number of surviving male passengers.
male_survived_n = len(male_survived)
In [166]:
# Combine both conditions into a single mask. Chaining df[mask1][mask2]
# applies a full-length mask to an already filtered frame, which pandas can
# only resolve by reindexing the mask (and warns about).
female_survived = df[(df.survived == 1) & (df.sex == 'female')]
# Number of surviving female passengers.
female_survived_n = len(female_survived)
In [198]:
# One-row summary table: survivor counts per sex.
survivor_counts = {'male': male_survived_n, 'female': female_survived_n}
df_survived = pd.DataFrame(survivor_counts, index=['Survived_number'])
df_survived
Out[198]:
In [199]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
In [208]:
# Bar chart of the survivor counts. `legend` takes a boolean; the string
# 'True' only worked because any non-empty string is truthy.
df_survived.plot(kind='bar', title='survived female and male', legend=True)
Out[208]:
In [212]:
# Survival rate per embarked value, one line per sex.
sns.pointplot(
    data=df,
    x='embarked',
    y='survived',
    hue='sex',
    palette={'male':'blue', 'female':'pink'},
    markers=["*", "o"],
    linestyles=['-', '--'],
)
Out[212]:
In [215]:
# One facet per embarked value: survival rate by pclass, split by sex.
grid = sns.FacetGrid(df, col='embarked')
# Fix the category order explicitly. Each facet only sees its own subset of
# rows, so without `order`/`hue_order` the categories (and the marker/line
# style given to each sex) can differ from one facet to the next.
grid.map(
    sns.pointplot, 'pclass', 'survived', 'sex',
    order=sorted(df.pclass.dropna().unique()),
    hue_order=['male', 'female'],
    palette={'male':'blue', 'female':'pink'},
    markers=["*", "o"],
    linestyles=['-', '--'],
)
grid.add_legend()
Out[215]:
In [217]:
# One facet per pclass: age by embarked value, split by sex.
# Use `df`, the frame loaded earlier; `data_train` is not defined anywhere in
# the visible notebook and raises NameError on a fresh kernel.
grid = sns.FacetGrid(df, col='pclass')
# Fix the category order explicitly so every facet lays out the same bars in
# the same positions even when a facet is missing a category.
grid.map(
    sns.barplot, 'embarked', 'age', 'sex',
    order=sorted(df.embarked.dropna().unique()),
    hue_order=sorted(df.sex.dropna().unique()),
)
grid.add_legend()
Out[217]:
Use 'Pclass' and 'Sex' to estimate the missing values in 'Age'.
In [218]:
## Type Your Answer Below ##
# Keep only the rows whose age is greater than 23, then display them.
age_filter = '''age>23'''
df_23 = df.query(age_filter)
df_23
Out[218]:
In [275]:
# first split name into string lists by ' '
def format_name(df):
    """Add a ``split_name`` column holding each name split on single spaces.

    The frame passed in is modified in place and is also returned.
    """
    def _split_on_space(full_name):
        return full_name.split(' ')

    df['split_name'] = df['name'].apply(_split_on_space)
    return df
# Build the split_name column before reading it; sampling it first fails on a
# fresh kernel because the column does not exist yet.
named_df = format_name(df)
print(named_df.sample(3).split_name, '\n')
# for each piece of every name, check whether it contains "jack" or "rose"
for name_parts in named_df.split_name:
    for part in name_parts:
        lowered = part.lower()
        # Logical `or` short-circuits and is the idiomatic boolean operator here.
        if "jack" in lowered or "rose" in lowered:
            print("found names that contain jack or rose: ", part)
In [243]:
## Type Your Answer Below ##
# All rows with pclass equal to 1; its length is the denominator used by percent().
df4 = df.query('''pclass==1''')


def percent(x):
    """Return the non-null count of ``x`` divided by the number of rows in ``df4``."""
    return int(x.count()) / len(df4)


# Share of the pclass==1 rows falling into each survived group.
first_class = df[['survived','pclass']].query('''pclass==1''')
first_class.groupby(['survived']).agg({'pclass': percent})
Out[243]: