In [2]:
import pandas as pd
from pandas import *


C:\Users\M-SUDHAN\AppData\Local\Continuum\Anaconda\lib\site-packages\pandas\io\excel.py:626: UserWarning: Installed openpyxl is not supported at this time. Use >=1.6.1 and <2.0.0.
  .format(openpyxl_compat.start_ver, openpyxl_compat.stop_ver))

In [3]:
# FIRST EXERCISE: Counting Number of Lines for Each File (Bookings and Searches)

In [4]:
cnt_booking=0
with open("C:/amadeus/bookings.csv") as infile:
    for line in infile:
        cnt_booking=cnt_booking+1
    print 'No of lines for Bookings: %d' % cnt_booking


No of lines for Bookings: 10000011

In [5]:
cnt_searches=0
with open("C:/amadeus/searches.csv") as infile:
    for line in infile:
        cnt_searches=cnt_searches+1
    print 'No of lines for Searches: %d' % cnt_searches


No of lines for Searches: 20390199

In [6]:
# Second exercise: top 10 arrival airports in the world in 2013 (using the bookings file)

In [7]:
# Displaying top rows

In [8]:
df=pd.read_table('C:/amadeus/bookings.csv',sep='^',iterator=True,chunksize=50000)

In [9]:
df.get_chunk(5)


Out[9]:
act_date source pos_ctry pos_iata pos_oid rloc cre_date duration distance dep_port ... route carrier bkg_class cab_class brd_time off_time pax year month oid
0 2013-03-05 00:00:00 1A DE a68dd7ae953c8acfb187a1af2dcbe123 1a11ae49fcbf545fd2afc1a24d88d2b7 ea65900e72d71f4626378e2ebd298267 2013-02-22 00:00:00 1708 0 ZRH ... LHRZRH VI T Y 2013-03-07 08:50:00 2013-03-07 11:33:37 -1 2013 3 NULL
1 2013-03-26 00:00:00 1A US e612b9eeeee6f17f42d9b0d3b79e75ca 7437560d8f276d6d05eeb806d9e7edee 737295a86982c941f1c2da9a46a14043 2013-03-26 00:00:00 135270 0 SAL ... SALATLCLT NV L Y 2013-04-12 13:04:00 2013-04-12 22:05:40 1 2013 3 NULL
2 2013-03-26 00:00:00 1A US e612b9eeeee6f17f42d9b0d3b79e75ca 7437560d8f276d6d05eeb806d9e7edee 737295a86982c941f1c2da9a46a14043 2013-03-26 00:00:00 135270 0 SAL ... CLTATLSAL NV U Y 2013-07-15 07:00:00 2013-07-15 11:34:51 1 2013 3 NULL
3 2013-03-26 00:00:00 1A AU 0f984b3bb6bd06661c95529bbd6193bc 36472c6dbaf7afec9136ac40364e2794 5ecf00fdcbcec761c43dc7285253d0c1 2013-03-26 00:00:00 30885 0 AKL ... AKLHKGSVO XK G Y 2013-04-24 23:59:00 2013-04-25 16:06:31 1 2013 3 SYDA82546
4 2013-03-26 00:00:00 1A AU 0f984b3bb6bd06661c95529bbd6193bc 36472c6dbaf7afec9136ac40364e2794 5ecf00fdcbcec761c43dc7285253d0c1 2013-03-26 00:00:00 30885 0 AKL ... SVOHKGAKL XK G Y 2013-05-14 20:15:00 2013-05-16 10:44:50 1 2013 3 SYDA82546

5 rows × 38 columns


In [10]:
df=pd.read_table('C:/amadeus/bookings.csv',sep='^',usecols=['arr_port','pax'])

In [11]:
dfg=df.groupby('arr_port').sum()
print 'Top 10 arrival airports in the world in 2013'
dfg.sort(['pax'],ascending=[0]).head(10)


Top 10 arrival airports in the world in 2013
Out[11]:
pax
arr_port
LHR 88809
MCO 70930
LAX 70530
LAS 69630
JFK 66270
CDG 64490
BKK 59460
MIA 58150
SFO 58000
DXB 55590

In [12]:
# Finding City and Airport names with corresponding to airport

In [13]:
from GeoBases import GeoBase
geo_o = GeoBase(data='ori_por', verbose=False)

In [14]:
df_geo=dfg.sort(['pax'],ascending=[0]).head(10).add_suffix('_total').reset_index()

In [15]:
# Cleaning the white spaces

In [16]:
df_geo['cleansed']=df_geo.arr_port.str.strip(' ')
df_geo.drop('arr_port', axis=1, inplace=True)
df_geo.rename(columns={'cleansed':'arr_port'}, inplace=True)
df_geo['Airport_Name']=df_geo['arr_port']
df_geo['City']=df_geo['arr_port']

In [17]:
# Matching City and Airport Names

In [18]:
i=0;
while i < df_geo.arr_port.count():
    df_geo['Airport_Name'][i]=geo_o.get(df_geo['arr_port'][i], 'name')
    df_geo['City'][i]=geo_o.get(df_geo['arr_port'][i], 'city_code')
    i=i+1
df_geo=df_geo[['arr_port', 'pax_total', 'Airport_Name', 'City']]

In [19]:
print 'Top 10 arrival airports in the world in 2013 with Airport and City Names'
df_geo


Top 10 arrival airports in the world in 2013 with Airport and City Names
Out[19]:
arr_port pax_total Airport_Name City
0 LHR 88809 London Heathrow Airport LON
1 MCO 70930 Orlando International Airport ORL
2 LAX 70530 Los Angeles International Airport LAX
3 LAS 69630 Mc Carran International Airport LAS
4 JFK 66270 John F Kennedy International Airport NYC
5 CDG 64490 Paris - Charles-de-Gaulle PAR
6 BKK 59460 Suvarnabhumi BKK
7 MIA 58150 Miami International Airport MIA
8 SFO 58000 San Francisco International Airport SFO
9 DXB 55590 Dubai International Airport DXB

In [20]:
# Third exercise: plot the monthly number of searches for flights arriving at Málaga, Madrid or Barcelona

In [35]:
import matplotlib
import pylab
import numpy as np
%pylab inline


Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['pylab']
`%matplotlib` prevents importing * from pylab and numpy

In [22]:
df_search=pd.read_table('C:/amadeus/searches.csv',sep='^',iterator=True,chunksize=50000)

In [23]:
df_search.get_chunk(2)


Out[23]:
Date Time TxnCode OfficeID Country Origin Destination RoundTrip NbSegments Seg1Departure ... Seg6Arrival Seg6Date Seg6Carrier Seg6BookingCode From IsPublishedForNeg IsFromInternet IsFromVista TerminalID InternetOffice
0 2013-01-01 20:25:57 MPT 624d8c3ac0b3a7ca03e3c167e0f48327 DE TXL AUH 1 2 TXL ... NaN NaN NaN NaN 1ASIWS 0 0 0 d41d8cd98f00b204e9800998ecf8427e FRA
1 2013-01-01 10:15:33 MPT b0af35b31588dc4ab06d5cf2986e8e02 MD ATH MIL 0 1 ATH ... NaN NaN NaN NaN 1ASIWS 0 0 0 d41d8cd98f00b204e9800998ecf8427e KIV

2 rows × 45 columns


In [24]:
geo_o.get('AGP', 'name')


Out[24]:
'M\xc3\xa1laga Airport'

In [25]:
geo_o.get('MAD', 'name')


Out[25]:
'Madrid / Barajas'

In [26]:
geo_o.get('BCN', 'name')


Out[26]:
'Barcelona\xe2\x80\x93El Prat Airport'

In [27]:
df_search=pd.read_table('C:/amadeus/searches.csv',sep='^',usecols=['Date','Destination'])

In [28]:
df2=df_search[df_search['Destination'].isin(['BCN', 'MAD', 'AGP'])]

In [29]:
df2['Searches']=1

In [30]:
df2['month'] = pd.DatetimeIndex(df2['Date']).month

In [42]:
rcParams['figure.figsize'] = 20, 15

In [32]:
df3= df2.groupby(['month','Destination']).sum()

In [33]:
df3.head()


Out[33]:
Searches
month Destination
1 AGP 9633
BCN 29469
MAD 24258
2 AGP 8379
BCN 28329

In [43]:
df3.plot(kind='bar')


Out[43]:
<matplotlib.axes.AxesSubplot at 0x2cb17190>

In [44]:
# Bonus exercise: write a Web Service

In [45]:
# Web Service with Flask and Json for Top 10 arrival airports in the world in 2013

In [46]:
import json

In [46]:
df_air=pd.read_table('C:/amadeus/bookings.csv',sep='^',usecols=['arr_port','pax'])

In [47]:
def top_airports(n):
    dfg=df_air.groupby('arr_port').sum()
    dfg2=dfg.sort(['pax'],ascending=[0]).head(int(n)).add_suffix('_total').reset_index()
    df_json=dfg2.to_json(orient='index')
    return df_json

In [48]:
from flask import Flask, url_for, Response
app = Flask(__name__)


@app.route('/api/v1/top_airports/<n>', methods = ['GET'])
def top_airport(n):
    df_json=top_airports(n)
    js = json.dumps(df_json)

    resp = Response(df_json, status=200, mimetype='application/json')
    
    return resp    

if __name__ == '__main__':
    app.run()

In [49]:
# Type the below URL in a Browser with N value. For example for top 20 airports, type as below
# http://localhost:5000/api/v1/top_airports/20

# Below output is returned in the Browser

# {"0":{"arr_port":"LHR     ","pax_total":88809.0},"1":{"arr_port":"MCO     ","pax_total":70930.0},"2":{"arr_port":"LAX    
#","pax_total":70530.0},"3":{"arr_port":"LAS     ","pax_total":69630.0},"4":{"arr_port":"JFK     ","pax_total":66270.0},
#"5":{"arr_port":"CDG     ","pax_total":64490.0},"6":{"arr_port":"BKK     ","pax_total":59460.0},"7":{"arr_port":"MIA     
#","pax_total":58150.0},"8":{"arr_port":"SFO     ","pax_total":58000.0},"9":{"arr_port":"DXB     ","pax_total":55590.0},
#"10":{"arr_port":"ORD     ","pax_total":53200.0},"11":{"arr_port":"FCO     ","pax_total":45120.0},"12":{"arr_port":"IST    
#","pax_total":43850.0},"13":{"arr_port":"DFW     ","pax_total":42420.0},"14":{"arr_port":"CUN     ","pax_total":42000.0},"15"
#:{"arr_port":"MAD     ","pax_total":41410.0},"16":{"arr_port":"BCN     ","pax_total":41200.0},"17":{"arr_port":"LGA    
#","pax_total":40712.0},"18":{"arr_port":"ATL     ","pax_total":39780.0},"19":{"arr_port":"EWR     ","pax_total":36890.0}}

In [47]:
# Bonus exercise: match searches with bookings including cancellations

In [4]:
df_bookings=pd.read_table('C:/amadeus/bookings.csv',sep='^',usecols=['arr_port','dep_port'])

In [5]:
df_bookings['flag']=1

In [6]:
df_bookings['arr_port']=df_bookings.arr_port.str.strip(' ')

In [7]:
df_bookings['dep_port']=df_bookings.dep_port.str.strip(' ')

In [46]:
df_bookings.rename(columns={'arr_port':'Destination','dep_port':'Origin'}, inplace=True)

In [46]:
df_searches=pd.read_table('C:/amadeus/searches.csv',sep='^',usecols=['Date','Origin','Destination'])

In [46]:
df_merge = pd.merge(df_searches, df_bookings, how='left', on=['Origin','Destination'])

In [46]:
df_merge=df_merge.fillna(0)

In [47]:
df_merge.to_csv('Converted_Searches.csv')

In [49]:
# In the output, there will be extra column called flag, when flag=1(search converted to booking) ,flag=0 (search not converted to booking)

In [50]:
#--------------------------------------End of Exercises including Bonus Exercises--------------------------------------------------------