From loops and conditionals, to working with data files


In [1]:
# reviewing looping through a list
my_list = [1, 2, 3, 4, 5]
for value in my_list:
    print(value ** 2, end=' ')


1 4 9 16 25 

In [2]:
# reviewing range() to loop a set number of times
for value in range(5):
    print((value + 1) ** 2, end=' ')


1 4 9 16 25 

In [3]:
# testing values as we loop through a list
my_value = 2
for value in range(5):
    if value == my_value:
        print('found it')


found it

In [4]:
# encapsulate with a function and find if some value appears in some list
def find_value(my_value, my_list):
    is_found = False
    for value in my_list:
        if value == my_value:
            is_found = True
    return is_found

# now use our function
user_id = 3
user_ids = [1, 2, 3, 4, 5, 6]
if find_value(user_id, user_ids):
    print('found it')
else:
    print('did not find it')


found it

In [5]:
# what if we want to raise all the numbers in a list to the 2nd power?
range(10) ** 2


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-5-29289dd35655> in <module>()
      1 # what if we want to raise all the numbers in a list to the 2nd power?
----> 2 range(10) ** 2

TypeError: unsupported operand type(s) for ** or pow(): 'range' and 'int'

In [6]:
# oops, error! instead, use list comprehension: do the operation element-wise
[ x ** 2 for x in range(10) ]


Out[6]:
[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

Now we will use these same loops and conditionals to work with a data file


In [7]:
# import python's csv module to open a csv file
import csv

In [8]:
# with guarantees the file will be closed even if we hit an exception
# rt means 'read' and 'text mode'
with open('rents_raw.csv', 'r', encoding='utf-8', newline=None) as csvfile:
    my_csv = csv.reader(csvfile)
    for row in my_csv:
        print(row)


['neighborhood', 'price', 'bedrooms', 'date', 'sqft', 'longitude', 'latitude']
['foster city', '2495', '1', '11/14/2014 12:26', '755', '-122.27', '37.5538']
['palo alto', '2695', '', '11/14/2014 12:25', '443', '-122.161524', '37.450289']
['brisbane', '3150', '2', '11/14/2014 12:24', '1242', '-122.417912', '37.692415']
['palo alto', '2800', '2', '11/14/2014 12:24', '', '', '']
['san mateo', '2196', '1', '11/14/2014 12:24', '676', '-122.2998', '37.5395']
['santa clara', '3264', '3', '11/14/2014 12:28', '1138', '', '']
['san jose south', '2000', '2', '11/14/2014 12:28', '822', '-121.902268', '37.253503']
['sunnyvale', '4740', '3', '11/14/2014 12:28', '1406', '-122.034683', '37.368445']
['inner sunset / UCSF', '3395', '2', '11/14/2014 12:32', '', '-122.479345', '37.764582']
['richmond / seacliff', '2699', '1', '11/14/2014 12:32', '', '-122.503781', '37.7718']
['SOMA / south beach', '3620', '1', '11/14/2014 12:30', '860', '-122.395195', '37.775133']
['dublin / pleasanton / livermore', '2025', '1', '11/14/2014 12:18', '636', '-121.787665', '37.67963']
['concord / pleasant hill / martinez', '', '2', '11/14/2014 12:18', '1019', '-122.035275', '37.975259']
['hercules, pinole, san pablo, el sob', '1795', '1', '11/14/2014 12:17', '715', '-122.321672', '37.978086']
['corte madera', '4299', '3', '11/14/2014 12:33', '1533', '-122.488541', '37.919834']

In [9]:
# the column headers are the first row in the data file
# use next to iterate our csv reader to the first row to grab the headers
with open('rents_raw.csv', 'r', encoding='utf-8', newline=None) as csvfile:
    my_csv = csv.reader(csvfile)
    headers = next(my_csv)
    print(headers)


['neighborhood', 'price', 'bedrooms', 'date', 'sqft', 'longitude', 'latitude']

In [10]:
# what is the 1st column (zero-indexed) in our data set?
headers[1]


Out[10]:
'price'

In [11]:
# for each row in the data set, print the price column's value
with open('rents_raw.csv', 'r', encoding='utf-8', newline=None) as csvfile:
    my_csv = csv.reader(csvfile)
    for row in my_csv:
        print(row[1])


price
2495
2695
3150
2800
2196
3264
2000
4740
3395
2699
3620
2025

1795
4299

In [12]:
# create a new list to contain the column of prices in the data set
prices = []
with open('rents_raw.csv', 'r', encoding='utf-8', newline=None) as csvfile:
    my_csv = csv.reader(csvfile)
    for row in my_csv:
        prices.append(row[1])  
prices


Out[12]:
['price',
 '2495',
 '2695',
 '3150',
 '2800',
 '2196',
 '3264',
 '2000',
 '4740',
 '3395',
 '2699',
 '3620',
 '2025',
 '',
 '1795',
 '4299']

This list has a couple of problems. First, it includes the header. Second, it's all strings even though prices are numeric data. Third, it contains some empty strings. We'll have to clean it up.


In [13]:
# to remove the first element of the list, we can just capture position 1 through the end of the list
prices = prices[1:]
prices


Out[13]:
['2495',
 '2695',
 '3150',
 '2800',
 '2196',
 '3264',
 '2000',
 '4740',
 '3395',
 '2699',
 '3620',
 '2025',
 '',
 '1795',
 '4299']

In [14]:
# now let's convert the price strings to integers
for price in prices:
    print(int(float(price)))


2495
2695
3150
2800
2196
3264
2000
4740
3395
2699
3620
2025
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-14-e7def979dcd8> in <module>()
      1 # now let's convert the price strings to integers
      2 for price in prices:
----> 3     print(int(float(price)))

ValueError: could not convert string to float: 

In [15]:
# you can't convert an empty string to a numeric type
for price in prices:
    if not price == '':
        print(int(float(price)))
    else:
        print('None')


2495
2695
3150
2800
2196
3264
2000
4740
3395
2699
3620
2025
None
1795
4299

In [16]:
# encapsulate this functionality inside a new function
def extract_int_price(price):
    if not price == '':
        return int(float(price))
    else:
        return None

In [17]:
# use our function to convert each element in the list of prices to an integer
for price in prices:
    print(extract_int_price(price))


2495
2695
3150
2800
2196
3264
2000
4740
3395
2699
3620
2025
None
1795
4299

In [18]:
# rather than just printing each converted value, turn it into a new list called int_prices
int_prices = []
for price in prices:
    int_prices.append(extract_int_price(price))
print(int_prices)


[2495, 2695, 3150, 2800, 2196, 3264, 2000, 4740, 3395, 2699, 3620, 2025, None, 1795, 4299]

In [19]:
# do the same thing, using list comprehension
int_prices = [ extract_int_price(price) for price in prices ]
print(int_prices)


[2495, 2695, 3150, 2800, 2196, 3264, 2000, 4740, 3395, 2699, 3620, 2025, None, 1795, 4299]

Now let's clean up our neighborhood names


In [20]:
# replace any forward slashes in neighborhood name with a hyphen
with open('rents_raw.csv', 'r', encoding='utf-8', newline=None) as csvfile:
    my_csv = csv.reader(csvfile)
    next(my_csv) #skip the header row
    for row in my_csv:
        print(row[0].replace('/', '-')) #use string.replace() method


foster city
palo alto
brisbane
palo alto
san mateo
santa clara
san jose south
sunnyvale
inner sunset - UCSF
richmond - seacliff
SOMA - south beach
dublin - pleasanton - livermore
concord - pleasant hill - martinez
hercules, pinole, san pablo, el sob
corte madera

Create a new data set with cleaned up variables

first create a couple of functions to do the cleaning


In [21]:
# create a new function to convert bedrooms from a string to an int
def extract_int_bedrooms(bedrooms):
    if not bedrooms == '':
        return int(float(bedrooms))
    else:
        return None

In [22]:
# create a new function to replace forward slashes and commas with hyphens
def clean_neighborhood(neighborhood_name):
    # you can daisy chain multiple string.replace() methods
    return neighborhood_name.replace('/', '-').replace(',', '')

In [23]:
# clean the data set by calling the cleaning functions and save the results to variables
rentals_cleaned = []
with open('rents_raw.csv', 'r', encoding='utf-8', newline=None) as csvfile:
    my_csv = csv.reader(csvfile)
    next(my_csv)
    for row in my_csv:
        neighborhood_cleaned = clean_neighborhood(row[0])
        price_cleaned = extract_int_price(row[1])
        bedrooms_cleaned = extract_int_bedrooms(row[2])
        rentals_cleaned.append([neighborhood_cleaned, price_cleaned, bedrooms_cleaned])      

# display our nested lists of data        
rentals_cleaned


Out[23]:
[['foster city', 2495, 1],
 ['palo alto', 2695, None],
 ['brisbane', 3150, 2],
 ['palo alto', 2800, 2],
 ['san mateo', 2196, 1],
 ['santa clara', 3264, 3],
 ['san jose south', 2000, 2],
 ['sunnyvale', 4740, 3],
 ['inner sunset - UCSF', 3395, 2],
 ['richmond - seacliff', 2699, 1],
 ['SOMA - south beach', 3620, 1],
 ['dublin - pleasanton - livermore', 2025, 1],
 ['concord - pleasant hill - martinez', None, 2],
 ['hercules pinole san pablo el sob', 1795, 1],
 ['corte madera', 4299, 3]]

What are the mean and max rainfall amounts?

Use rain.csv to calculate mean and maximum values in a column


In [24]:
# use round to round a number to a specified precision
round(3.14159265359, 5)


Out[24]:
3.14159

In [25]:
with open('rain.csv', 'r', encoding='utf-8', newline=None) as csvfile:
    
    # initialize a counter and variables to contain our descriptive stats
    count = 0 #at the end, divide cumulative_sum by this to get the mean
    cumulative_sum = 0 #our rolling sum
    max_value = -1 #pick a really small number that's guaranteed to be less than the max
    
    # open the file and skip the header row
    my_csv = csv.reader(csvfile)
    next(my_csv)
    
    # loop through each data row
    for row in my_csv:
        
        # rainfall amount is in column 1, only process this row's value if not an empty string
        if not row[1] == '':
            
            # increment the counter and extract this row's rainfall as a float
            count = count + 1
            rainfall = float(row[1])
            
            # add this row's rainfall to the cumulative sum
            cumulative_sum = cumulative_sum + rainfall
            
            # if this row's rainfall is greater than the current max value, update with the new max
            if rainfall > max_value:
                max_value = rainfall

    # after looping through all the rows, divide the cumulative sum by the count and round to get the mean
    mean_value = round(cumulative_sum / count, 1)
    
    # print out the mean and max values
    print('mean:', mean_value, 'inches')
    print('max:', max_value, 'inches')


mean: 3.7 inches
max: 5.9 inches

Now, how would you calculate the minimum?


In [ ]: