In [3]:
import pandas as pd


In [3]:
# First exercise: counting the number of lines in each file (bookings and searches)

In [4]:
# Initialize a counter and read the file line by line, increasing the counter by 1 for each line read
cnt_booking=0
with open("C:/amadeus/bookings.csv") as infile:
    for line in infile:
        cnt_booking += 1
    print 'No of lines for Bookings: %d' % cnt_booking


No of lines for Bookings: 10000011

In [5]:
# Same as above: the number of lines is counted by looping through the rows of the searches file
cnt_searches=0
with open("C:/amadeus/searches.csv") as infile:
    for line in infile:
        cnt_searches += 1
    print 'No of lines for Searches: %d' % cnt_searches


No of lines for Searches: 20390199
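
In [ ]:
# A shorter sketch of the same count (assumption: counting raw lines, header row included),
# using a generator expression instead of an explicit counter.
def count_lines(path):
    with open(path) as infile:
        return sum(1 for _ in infile)

print('No of lines for Bookings: %d' % count_lines('C:/amadeus/bookings.csv'))
print('No of lines for Searches: %d' % count_lines('C:/amadeus/searches.csv'))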

In [6]:
# Second exercise: top 10 arrival airports in the world in 2013 (using the bookings file)

In [7]:
# Displaying top rows

In [8]:
# Since the file is very large, it is a good idea to read it in chunks and first understand the variables it contains
df=pd.read_table('C:/amadeus/bookings.csv',sep='^',iterator=True,chunksize=50000)

In [9]:
# Display the first 5 rows of the first chunk of the bookings file
df.get_chunk(5)


Out[9]:
act_date source pos_ctry pos_iata pos_oid rloc cre_date duration distance dep_port ... route carrier bkg_class cab_class brd_time off_time pax year month oid
0 2013-03-05 00:00:00 1A DE a68dd7ae953c8acfb187a1af2dcbe123 1a11ae49fcbf545fd2afc1a24d88d2b7 ea65900e72d71f4626378e2ebd298267 2013-02-22 00:00:00 1708 0 ZRH ... LHRZRH VI T Y 2013-03-07 08:50:00 2013-03-07 11:33:37 -1 2013 3 NULL
1 2013-03-26 00:00:00 1A US e612b9eeeee6f17f42d9b0d3b79e75ca 7437560d8f276d6d05eeb806d9e7edee 737295a86982c941f1c2da9a46a14043 2013-03-26 00:00:00 135270 0 SAL ... SALATLCLT NV L Y 2013-04-12 13:04:00 2013-04-12 22:05:40 1 2013 3 NULL
2 2013-03-26 00:00:00 1A US e612b9eeeee6f17f42d9b0d3b79e75ca 7437560d8f276d6d05eeb806d9e7edee 737295a86982c941f1c2da9a46a14043 2013-03-26 00:00:00 135270 0 SAL ... CLTATLSAL NV U Y 2013-07-15 07:00:00 2013-07-15 11:34:51 1 2013 3 NULL
3 2013-03-26 00:00:00 1A AU 0f984b3bb6bd06661c95529bbd6193bc 36472c6dbaf7afec9136ac40364e2794 5ecf00fdcbcec761c43dc7285253d0c1 2013-03-26 00:00:00 30885 0 AKL ... AKLHKGSVO XK G Y 2013-04-24 23:59:00 2013-04-25 16:06:31 1 2013 3 SYDA82546
4 2013-03-26 00:00:00 1A AU 0f984b3bb6bd06661c95529bbd6193bc 36472c6dbaf7afec9136ac40364e2794 5ecf00fdcbcec761c43dc7285253d0c1 2013-03-26 00:00:00 30885 0 AKL ... SVOHKGAKL XK G Y 2013-05-14 20:15:00 2013-05-16 10:44:50 1 2013 3 SYDA82546

5 rows × 38 columns
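
In [ ]:
# Sketch: read only a handful of rows with nrows to list all 38 column names
# (same path and separator as above; this does not consume the chunk iterator).
sample = pd.read_table('C:/amadeus/bookings.csv', sep='^', nrows=5)
print(list(sample.columns))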


In [10]:
# The file is loaded into memory as a pandas dataframe; only the columns 'arr_port' and 'pax' are read
df=pd.read_table('C:/amadeus/bookings.csv',sep='^',usecols=['arr_port','pax'])

In [11]:
# To find the total number of passengers per arrival airport, the dataframe is grouped by arr_port and pax is summed
# (cancellations included)
dfg=df.groupby('arr_port').sum()
print 'Top 10 arrival airports in the world in 2013'
dfg.sort(['pax'],ascending=[0]).head(10)


Top 10 arrival airports in the world in 2013
Out[11]:
pax
arr_port
LHR 88809
MCO 70930
LAX 70530
LAS 69630
JFK 66270
CDG 64490
BKK 59460
MIA 58150
SFO 58000
DXB 55590
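
In [ ]:
# A memory-friendly sketch of the same aggregation: sum pax per arr_port chunk by chunk and
# combine the partial results, instead of loading both columns for all ~10M rows at once.
reader = pd.read_table('C:/amadeus/bookings.csv', sep='^',
                       usecols=['arr_port', 'pax'], chunksize=500000)
partial = [chunk.groupby('arr_port').sum() for chunk in reader]
pax_totals = pd.concat(partial).groupby(level=0).sum()
pax_totals.sort(['pax'], ascending=[0]).head(10)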

In [12]:
# Finding the city and airport names corresponding to each arrival airport code

In [22]:
# The GeoBases module is imported into the IPython Notebook and the required database ('ori_por') is loaded
from GeoBases import GeoBase
geo_o = GeoBase(data='ori_por', verbose=False)
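
In [ ]:
# Quick check of the lookup call used below: get(<IATA code>, <field>) returns a single field for a code.
print(geo_o.get('LHR', 'name'))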

In [14]:
# The grouped dataframe is sorted, the index is reset so all headers appear on one line, and the top 10 airports are kept
df_geo=dfg.sort(['pax'],ascending=[0]).head(10).add_suffix('_total').reset_index()

In [15]:
# Since the arr_port values contain trailing white space, they need to be trimmed before they can be looked up

In [16]:
df_geo['arr_port']=df_geo.arr_port.str.strip()
df_geo['Airport_Name']=df_geo['arr_port']
df_geo['City']=df_geo['arr_port']

In [17]:
# Using GeoBases, city and airport names are looked up with the trimmed arr_port code from the bookings file as the key

In [18]:
i=0
while i < df_geo.arr_port.count():
    df_geo.loc[i, 'Airport_Name']=geo_o.get(df_geo['arr_port'][i], 'name')
    df_geo.loc[i, 'City']=geo_o.get(df_geo['arr_port'][i], 'city_code')
    i=i+1
df_geo=df_geo[['arr_port', 'pax_total', 'Airport_Name', 'City']]
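
In [ ]:
# A vectorized sketch of the same lookup using Series.apply instead of an index loop
# (assumption: every arr_port code in the top 10 is present in the GeoBases data).
df_geo['Airport_Name'] = df_geo['arr_port'].apply(lambda code: geo_o.get(code, 'name'))
df_geo['City'] = df_geo['arr_port'].apply(lambda code: geo_o.get(code, 'city_code'))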

In [19]:
print 'Top 10 arrival airports in the world in 2013 with Airport and City Names'
df_geo


Top 10 arrival airports in the world in 2013 with Airport and City Names
Out[19]:
arr_port pax_total Airport_Name City
0 LHR 88809 London Heathrow Airport LON
1 MCO 70930 Orlando International Airport ORL
2 LAX 70530 Los Angeles International Airport LAX
3 LAS 69630 Mc Carran International Airport LAS
4 JFK 66270 John F Kennedy International Airport NYC
5 CDG 64490 Paris - Charles-de-Gaulle PAR
6 BKK 59460 Suvarnabhumi BKK
7 MIA 58150 Miami International Airport MIA
8 SFO 58000 San Francisco International Airport SFO
9 DXB 55590 Dubai International Airport DXB

In [20]:
# Third exercise: plot the monthly number of searches for flights arriving at Málaga, Madrid or Barcelona

In [1]:
import matplotlib.pyplot as plt
import numpy as np
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [3]:
# Displaying the first two rows of the first chunk to understand the searches data
df_search=pd.read_table('C:/amadeus/searches.csv',sep='^',iterator=True,chunksize=50000)

In [4]:
df_search.get_chunk(2)


Out[4]:
Date Time TxnCode OfficeID Country Origin Destination RoundTrip NbSegments Seg1Departure ... Seg6Arrival Seg6Date Seg6Carrier Seg6BookingCode From IsPublishedForNeg IsFromInternet IsFromVista TerminalID InternetOffice
0 2013-01-01 20:25:57 MPT 624d8c3ac0b3a7ca03e3c167e0f48327 DE TXL AUH 1 2 TXL ... NaN NaN NaN NaN 1ASIWS 0 0 0 d41d8cd98f00b204e9800998ecf8427e FRA
1 2013-01-01 10:15:33 MPT b0af35b31588dc4ab06d5cf2986e8e02 MD ATH MIL 0 1 ATH ... NaN NaN NaN NaN 1ASIWS 0 0 0 d41d8cd98f00b204e9800998ecf8427e KIV

2 rows × 45 columns


In [23]:
# Using GeoBases, the IATA codes are looked up to confirm which airports they refer to
geo_o.get('AGP', 'name')


Out[23]:
'M\xc3\xa1laga Airport'

In [24]:
geo_o.get('MAD', 'name')


Out[24]:
'Madrid / Barajas'

In [25]:
geo_o.get('BCN', 'name')


Out[25]:
'Barcelona\xe2\x80\x93El Prat Airport'

In [4]:
# The dataframe is loaded into memory with only the Date and Destination columns
df_search=pd.read_table('C:/amadeus/searches.csv',sep='^',usecols=['Date','Destination'])

In [5]:
# The dataframe is filtered down to Barcelona, Madrid and Malaga; .copy() avoids chained-assignment warnings below
df2=df_search[df_search['Destination'].isin(['BCN', 'MAD', 'AGP'])].copy()

In [6]:
# A counter column set to 1 is added so that summing it gives the total number of searches
df2['Searches']=1

In [7]:
# The month is extracted from the Date field
df2['month'] = pd.DatetimeIndex(df2['Date']).month

In [8]:
# The figure size is initialized for the plot
rcParams['figure.figsize'] = 12, 5

In [19]:
# The dataframe is grouped by destination and month to get the total number of searches
df3= df2.groupby(['Destination','month']).sum().add_suffix('_total').reset_index()

In [20]:
df3.head()


Out[20]:
Destination month Searches_total
0 AGP 1 9633
1 AGP 2 8379
2 AGP 3 10659
3 AGP 4 8265
4 AGP 5 10830

In [22]:
# Extracting dataframes for individual places
df_agp=df3[df3['Destination']=='AGP']
df_bcn=df3[df3['Destination']=='BCN']
df_mad=df3[df3['Destination']=='MAD']

In [30]:
# Plotting the monthly searches for the three cities using subplots
rcParams['figure.figsize'] = 15, 8
fig, axes = plt.subplots(nrows=2, ncols=2)

df_agp.groupby(['month']).sum().plot(ax=axes[0,0],title='Malaga',kind='bar',color='blue')
df_bcn.groupby(['month']).sum().plot(ax=axes[0,1],title='Barcelona',kind='bar',color='yellow')
df_mad.groupby(['month']).sum().plot(ax=axes[1,0],title='Madrid',kind='bar',color='grey')


Out[30]:
<matplotlib.axes.AxesSubplot at 0xc8109d0>
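
In [ ]:
# Alternative sketch: pivot the month/destination summary built above so all three destinations
# can be compared in a single plot.
pivoted = df3.pivot(index='month', columns='Destination', values='Searches_total')
pivoted.plot(title='Monthly searches for AGP, BCN and MAD')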

In [44]:
# Bonus exercise: write a Web Service

In [45]:
# Web service with Flask and JSON for the top 10 arrival airports in the world in 2013

In [46]:
import json

In [46]:
# Dataframe is loaded with the bookings data
df_air=pd.read_table('C:/amadeus/bookings.csv',sep='^',usecols=['arr_port','pax'])

In [47]:
# A function is defined to return the top n airports in JSON format
def top_airports(n):
    dfg=df_air.groupby('arr_port').sum()
    dfg2=dfg.sort(['pax'],ascending=[0]).head(int(n)).add_suffix('_total').reset_index()
    df_json=dfg2.to_json(orient='index')
    return df_json
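
In [ ]:
# Quick sanity check of the helper before exposing it as a web service (sketch).
print(top_airports(3))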

In [48]:
# Flask is imported into the IPython Notebook to create a web service
from flask import Flask, url_for, Response
app = Flask(__name__)


@app.route('/api/v1/top_airports/<n>', methods = ['GET'])
def top_airport(n):
    df_json=top_airports(n)

    resp = Response(df_json, status=200, mimetype='application/json')

    return resp

if __name__ == '__main__':
    app.run()
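
In [ ]:
# Sketch of calling the service from Python instead of a browser (run from a separate session,
# since app.run() above blocks the notebook kernel; assumes Flask's default port 5000).
import urllib2
print(urllib2.urlopen('http://localhost:5000/api/v1/top_airports/10').read())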

In [49]:
# Type the URL below in a browser with the desired value of n. For example, for the top 20 airports:
# http://localhost:5000/api/v1/top_airports/20

# Below output is returned in the Browser

# {"0":{"arr_port":"LHR     ","pax_total":88809.0},"1":{"arr_port":"MCO     ","pax_total":70930.0},"2":{"arr_port":"LAX    
#","pax_total":70530.0},"3":{"arr_port":"LAS     ","pax_total":69630.0},"4":{"arr_port":"JFK     ","pax_total":66270.0},
#"5":{"arr_port":"CDG     ","pax_total":64490.0},"6":{"arr_port":"BKK     ","pax_total":59460.0},"7":{"arr_port":"MIA     
#","pax_total":58150.0},"8":{"arr_port":"SFO     ","pax_total":58000.0},"9":{"arr_port":"DXB     ","pax_total":55590.0},
#"10":{"arr_port":"ORD     ","pax_total":53200.0},"11":{"arr_port":"FCO     ","pax_total":45120.0},"12":{"arr_port":"IST    
#","pax_total":43850.0},"13":{"arr_port":"DFW     ","pax_total":42420.0},"14":{"arr_port":"CUN     ","pax_total":42000.0},"15"
#:{"arr_port":"MAD     ","pax_total":41410.0},"16":{"arr_port":"BCN     ","pax_total":41200.0},"17":{"arr_port":"LGA    
#","pax_total":40712.0},"18":{"arr_port":"ATL     ","pax_total":39780.0},"19":{"arr_port":"EWR     ","pax_total":36890.0}}

In [47]:
# Bonus exercise: match searches with bookings including cancellations

In [4]:
df_bookings=pd.read_table('C:/amadeus/bookings.csv',sep='^',usecols=['arr_port','dep_port'])

In [5]:
df_bookings['flag']=1

In [6]:
df_bookings['arr_port']=df_bookings.arr_port.str.strip(' ')

In [7]:
df_bookings['dep_port']=df_bookings.dep_port.str.strip(' ')

In [46]:
df_bookings.rename(columns={'arr_port':'Destination','dep_port':'Origin'}, inplace=True)
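
In [ ]:
# Optional sketch: keep one row per booked (Origin, Destination) pair so the left merge below does
# not multiply search rows when the same route appears in many bookings (assumption: only a yes/no
# conversion flag per route is needed). The merge could then use df_bookings_unique instead.
df_bookings_unique = df_bookings.drop_duplicates()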

In [46]:
df_searches=pd.read_table('C:/amadeus/searches.csv',sep='^',usecols=['Date','Origin','Destination'])

In [46]:
df_merge = pd.merge(df_searches, df_bookings, how='left', on=['Origin','Destination'])

In [46]:
df_merge=df_merge.fillna(0)

In [47]:
df_merge.to_csv('Converted_Searches.csv')

In [49]:
# In the output there is an extra column called flag: flag=1 means the search was converted to a booking, flag=0 means it was not
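
In [ ]:
# Sketch: a quick summary of how many searches matched a booked route (flag=1) versus not (flag=0).
print(df_merge['flag'].value_counts())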

In [50]:
#--------------------------------------End of Exercises including Bonus Exercises--------------------------------------------------------