Data from https://openflights.org/data.html
In [90]:
import pandas as pd
In [121]:
# Load the three OpenFlights tables. The files are read without a header row,
# the "\N" marker is treated as a missing value, and every column stays text.
airports = pd.read_csv(
    "../data/airports.dat.txt",
    header=None,
    na_values=['\\N'],
    dtype=str,
)
airlines = pd.read_csv(
    "../data/airlines.dat.txt",
    header=None,
    na_values=['\\N'],
    dtype=str,
)
routes = pd.read_csv(
    "../data/routes.dat.txt",
    header=None,
    na_values=['\\N'],
    dtype=str,
)
Assign column headers to the dataframes.
In [122]:
# Name the columns of each table; the files were loaded with header=None,
# so pandas only gave them positional integer labels.
airports.columns = [
    "id", "name", "city", "country", "code", "icao",
    "latitude", "longitude", "altitude", "offset", "dst", "timezone",
    "type", "source",
]
airlines.columns = [
    "id", "name", "alias", "iata", "icao", "callsign", "country", "active",
]
routes.columns = [
    "airline", "airline_id", "source", "source_id", "dest", "dest_id",
    "codeshare", "stops", "equipment",
]
In [123]:
# Preview the first rows of airports with the assigned column headers.
airports.head()
Out[123]:
In [124]:
# Preview the first rows of airlines with the assigned column headers.
airlines.head()
Out[124]:
In [125]:
# Preview the first rows of routes with the assigned column headers.
routes.head()
Out[125]:
In [126]:
# Look at airports once more before columns are dropped in the next cell.
airports.head()
Out[126]:
In [127]:
# Remove the 'type' and 'source' columns from airports.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution no longer raises KeyError
# for columns that are already gone.
airports = airports.drop(columns=['type', 'source'], errors='ignore')
In [128]:
# Preview airports after removing the 'type' and 'source' columns.
airports.head()
Out[128]:
In [129]:
# (rows, columns) of the airports frame.
airports.shape
Out[129]:
Let's start by dropping redundant rows: in the airlines data frame, we don't need the row with id = -1.
In [130]:
# Remove the airline row whose id is "-1" by value rather than by index
# label 0. ids are strings because the file is read with dtype=str.
# Filtering by value does not depend on where that row sits in the frame and
# is safe to re-run, whereas drop(0) raises KeyError on a second execution.
airlines = airlines[airlines["id"] != "-1"]
In [131]:
# (rows, columns) of airlines after the row removal above.
airlines.shape
Out[131]:
In [132]:
# Preview the first remaining airlines rows.
airlines.head()
Out[132]:
In [133]:
def checkConsistency(s1, s2):
    """Return the share of non-null values in ``s1`` that also occur in ``s2``.

    Args:
        s1: pandas Series holding the referencing keys (e.g. a foreign key).
        s2: pandas Series (or any list-like accepted by ``Series.isin``)
            holding the referenced keys.

    Returns:
        A float in [0, 1]: matched non-null values of ``s1`` divided by the
        number of non-null values of ``s1``. Returns NaN when ``s1`` has no
        non-null values, since the ratio is undefined.
    """
    # Missing values are excluded from BOTH numerator and denominator.
    # Series.isin treats NaN as matching NaN, while count() skips NaN, so
    # leaving them in the numerator could inflate the ratio (even above 1).
    present = s1.dropna()
    total_count = present.count()
    if total_count == 0:
        return float("nan")
    true_count = present.isin(s2).sum()
    return true_count / total_count
In [134]:
# Boolean mask: True for routes whose airline_id does not occur in airlines.id.
# `~` negates element-wise; Python's `not` applied to a Series raises
# ValueError because the truth value of a Series is ambiguous.
~routes.airline_id.isin(airlines.id)
In [115]:
# NOTE(review): leftover IPython source lookup (`??`). No object named
# `missing` is defined in the visible cells — consider removing this cell.
??missing
In [104]:
# Ratio of routes.airline_id values found in airlines.id to the number of
# non-null routes.airline_id values.
checkConsistency(routes.airline_id, airlines.id)
Out[104]:
In [105]:
# Ratio of routes.source_id values found in airports.id to the number of
# non-null routes.source_id values.
checkConsistency(routes.source_id, airports.id)
Out[105]:
In [106]:
# Ratio of routes.dest_id values found in airports.id to the number of
# non-null routes.dest_id values.
checkConsistency(routes.dest_id, airports.id)
Out[106]:
In [135]:
import missingno as msno
%matplotlib inline
In [136]:
# Visualize missing values in airlines with missingno's matrix plot.
msno.matrix(airlines)
In [137]:
# Visualize missing values in airports with missingno's matrix plot.
msno.matrix(airports)
In [88]:
# Number of routes with a missing airline_id.
# routes is read with na_values=['\\N'], so the "\N" placeholder is already
# NaN by the time this runs; comparing against the literal string can never
# match and would always report zero. Test for NaN instead.
routes["airline_id"].isna().sum()
Out[88]:
In [89]:
# Keep only routes that have an airline_id.
# routes is read with na_values=['\\N'], so "\N" placeholders are NaN here.
# `!= "\\N"` evaluated to True for every row (NaN included) and removed
# nothing; notna() performs the intended filter and is safe to re-run.
routes = routes[routes["airline_id"].notna()]
routes.shape
Out[89]:
In [ ]: