Beschreibung der Domäne, die auf Basis der relationalen DB gewünscht wird:
- Über unterschiedliche Kanäle (auch in Stationen) können Buchungen getätigt werden.
- Jede Station hat einen Betreiber
- In einer Buchung wird ein Fahrzeug ausgeliehen.
- Das Fahrzeug wird von einer Station abgeholt und in einer Station abgegeben
- Jede Buchung ist einer Tarifklasse zugeordnet.
- Fahrzeuge haben Eigenschaften
- Stationen haben Eigenschaften
- Tarifklassen haben Eigenschaften
- Buchungen haben Eigenschaften
Entscheidungen
- Fahrzeug-[wird abgeholt]->Station
- Fahrzeug-[wird abgegeben]->Station
- Buchung-[bezieht sich]->Fahrzeug
- Firma-[betreibt]->Station
- Firma-[gehoert zu]-> Gruppe
- Buchung-[hat]->Tarifklasse
- Tarifklasse-[ist untergeordnet zu]->Haupttarif
- Fahrzeuge unterscheiden sich durch
- Marke
- Kraftstoff
- Buchungen unterscheiden sich durch
- Tarifklasse
- Fahrstrecke
- Buchungsquelle
- Tarifklassen unterscheiden sich durch
- Haupttarif
In [81]:
import pandas as pd
import numpy as np

# Visual separator used to delimit the sections of the diagnostic output.
seperatingLine = "\n########################################################################################################\n"
# Default field delimiter for the generated .dsv export files.
defaultCsvItemDelimiter = ","
In [82]:
def writeDsvFile(df, typeName, delimiter, columnsList, headerList):
    """Export selected columns of *df* as a delimiter-separated file.

    The file is written to ./output/<typeName>.dsv without a row index.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    typeName : str
        Base name of the output file (node/relationship type).
    delimiter : str
        Field separator for the output file.
    columnsList : list of str
        DataFrame columns to export, in order.
    headerList : list of str
        Header names written instead of the column names
        (e.g. Neo4j import headers such as ':ID', ':LABEL').
    """
    import os
    # Robustness fix: DataFrame.to_csv raises FileNotFoundError when the
    # target directory does not exist, so create it on demand.
    os.makedirs('./output', exist_ok=True)
    filename = './output/' + typeName + '.dsv'
    df.to_csv(filename, index=False, sep=delimiter, columns=columnsList, header=headerList)
def trimName(longName):
    """Reduce a free-text name to a single label-safe token.

    NaN values are mapped to the literal string 'nan'; every other value
    is passed through repr() and stripped of spaces, parentheses and
    slashes so it can serve as a Neo4j label fragment.

    NOTE(review): repr() keeps a string's surrounding quotes in the
    result (e.g. "'Foo_Bar'"); kept as-is for backward compatibility.
    """
    # Bug fix: the original check `longName == np.nan` is always False,
    # because NaN never compares equal to anything (not even itself).
    # A NaN float is detected via the self-inequality property instead.
    if isinstance(longName, float) and longName != longName:
        return 'nan'
    return repr(longName).replace(' ', '_').replace('(', '').replace(')', '').replace('/', '')
In [83]:
def getLabel(row, nodeName):
    """Build the Neo4j ':LABEL' string for one row of node data.

    Parameters
    ----------
    row : mapping (pandas Series or dict)
        The data row; the columns read depend on *nodeName*.
    nodeName : str
        Node kind: 'CATEGORY', 'PARENT_CATEGORY', 'BOOKING', 'VEHICLE'
        or 'RENTAL_ZONE' (anything else yields the base label 'OBJ').

    Returns
    -------
    str
        Semicolon-separated label list; spaces become underscores and
        '(', ')', '/', '-' are removed so every part is label-safe.
    """
    switcher = {
        "CATEGORY": "TARIFF_CLASS",
        "PARENT_CATEGORY": "MAIN_TARIFF_CLASS",
        "BOOKING": "BOOKING",
        "VEHICLE": "VEHICLE",
        "RENTAL_ZONE": "STATION"
    }
    label = switcher.get(nodeName, 'OBJ')
    if nodeName == 'BOOKING':
        # Bookings carry their derived analysis dimensions as extra labels.
        label += ";" + str(row["PARENT_CATEGORY"])
        label += ";" + str(row["INCOME_CHANNEL_TYPE"])
        label += ";" + str(row["INCOME_CHANNEL_GROUP"])
        #label += ";" + str(row["VEHICLE_MANUFACTURER_NAME"])
        label += ";" + str(row["DISTANCE_CATEGORY"])
    elif nodeName == 'VEHICLE':
        fuelTypeOrig = str(row["FUEL_TYPE_NAME"])
        # Map the German fuel-type descriptions to short label tokens.
        fuelTypeSwitcher = {
            "Diesel": "DIESEL",
            "Erdgas (Nottank: Super)": "ERDGAS",
            "Plug In (Strom, Super)": "PLUGIN",
            "Strom": "STROM",
            "Super (Benzin)": "SUPER",
            "Super E10": "E10"
        }
        fuelType = fuelTypeSwitcher.get(fuelTypeOrig, 'UNKNOWN_FT')
        # Robustness fix: upper() is now applied after str(), so non-string
        # cell values (e.g. NaN) no longer raise AttributeError.
        label += \
            ";" + str(row["VEHICLE_MODEL_TYPE"]).upper() + \
            ";" + str(row["VEHICLE_MANUFACTURER_NAME"]).upper() + \
            ";" + fuelType
    elif nodeName == 'RENTAL_ZONE':
        label += ";" + str(row["TYPE"]).upper()
        # ACTIVE_X holds the German yes flag 'Ja' for active stations.
        label += ";ACTIVE" if str(row["ACTIVE_X"]).upper() == "JA" else ";INACTIVE"
    label = label.replace(' ', '_').replace('(', '').replace(')', '').replace('/', '').replace('-', '_')
    return label
In [84]:
def getDistanceCategory(row):
    """Classify a booking's travelled distance into a label category.

    Returns 'SHORT_DISTANCE' for distances up to 100 (inclusive),
    'MIDDLE_DISTANCE' up to 500 (inclusive), 'LONG_DISTANCE' above 500.
    """
    distance = float(row["DISTANCE"])
    # Bug fix: the original conditions (`500 > d > 100`, then `d > 500`)
    # left d == 500 unmatched, so it fell through to SHORT_DISTANCE.
    # Checking the largest bound first closes that gap.
    if distance > 500:
        return 'LONG_DISTANCE'
    if distance > 100:
        return 'MIDDLE_DISTANCE'
    return 'SHORT_DISTANCE'
In [85]:
def getRelationshipType(row, types):
    """Return the relationship type between two node kinds.

    *types* is a two-element sequence (start kind, end kind). *row* is
    accepted for DataFrame.apply compatibility but is not used. Unknown
    pairs yield 'UNDEFINED'.
    """
    startKind, endKind = types[0], types[1]
    relationshipCatalog = {
        ("CATEGORY", "PARENT_CATEGORY"): "BELONGS_TO",
        ("BOOKING", "VEHICLE"): "REFERS_TO",
        ("BOOKING", "CATEGORY"): "ACCORDING_TO",
        ("VEHICLE", "RENTAL_ZONE"): "WAS_BOOKED_IN",
    }
    return relationshipCatalog.get((startKind, endKind), 'UNDEFINED')
In [86]:
def getTechnicalIncomeChannelType(row):
    """Map the booking's technical income channel to a coarse type label.

    Returns one of 'APP', 'INTERNET', 'RENTALZONE', 'MOBILITY', or
    'UNKNOWN_TICHT' for channels that are not in the catalog.
    """
    # Channels booked through a mobile/desktop app.
    appChannels = {
        "BwCarsharing iPhone", "BwCarsharing Android", "BwCarsharing WindowsPhone",
        "Multicity iPhone", "Multicity Android",
        "ford2go Android", "ford2go iPhone",
        "Flinkster iPhone", "Flinkster Windows", "Flinkster Android", "Flinkster Carjump",
        "einfachMobil iPhone", "einfachMobil Android",
        "teilAuto",
        "Scouter 255 Android 2Denker", "Scouter 255 iOS 2Denker", "Scouter 255 Windows 2Denker",
        "Book-n-Drive iPhone", "Book-n-Drive Android",
        "Ford Carsharing FordPass",
    }
    # Channels booked through a web portal or server-side API.
    internetChannels = {
        "Internet", "Bahn_de_2", "Onesto_Bahn", "ICS-Server", "API",
        "Scouter 255 Web Praegnanz",
        "BwFPS Dispo Testzugang", "BwFPS Portal Web", "BwFPS Dispotool",
        "HALAPI Teilauto",
        "Flinkster - Mobility Map", "Flinkster Connect", "Flinkster E-Wald",
        "Flinkster Drive Carsharing",
        "Ford2Go Web", "EMIL Carsharing",
    }
    # Channels with an individual type of their own.
    specialChannels = {
        "Broker HAL": "RentalZone",
        "Stuttgart Service Card": "Mobility",
    }
    channel = str(row["TECHNICAL_INCOME_CHANNEL"])
    if channel in appChannels:
        channelType = "App"
    elif channel in internetChannels:
        channelType = "Internet"
    else:
        channelType = specialChannels.get(channel, "UNKNOWN_TICHT")
    return channelType.upper()
In [87]:
def getTechnicalIncomeChannelGroup(row):
    """Map the booking's technical income channel to its provider group.

    Returns the upper-cased group name, or 'UNKNOWN_TICHG' for channels
    that are not in the catalog.
    """
    # Catalog organized group -> channels for readability; it is inverted
    # into a channel -> group lookup table below.
    groupToChannels = {
        "DB": ["API", "Bahn_de_2", "Broker HAL", "ICS-Server", "Internet", "Onesto_Bahn"],
        "Book-n-Drive": ["Book-n-Drive Android", "Book-n-Drive iPhone"],
        "BwCarsharing": ["BwCarsharing Android", "BwCarsharing WindowsPhone", "BwCarsharing iPhone"],
        "BwFPS": ["BwFPS Dispo Testzugang", "BwFPS Dispotool", "BwFPS Portal Web"],
        "EMIL": ["EMIL Carsharing"],
        "Flinkster": ["Flinkster - Mobility Map", "Flinkster Android", "Flinkster Carjump",
                      "Flinkster Connect", "Flinkster Drive Carsharing", "Flinkster E-Wald",
                      "Flinkster Windows", "Flinkster iPhone"],
        "FordPass": ["Ford Carsharing FordPass"],
        "ford2go": ["Ford2Go Web", "ford2go Android", "ford2go iPhone"],
        "Teilauto": ["HALAPI Teilauto", "teilAuto"],
        "Multicity": ["Multicity Android", "Multicity iPhone"],
        "Scouter255": ["Scouter 255 Android 2Denker", "Scouter 255 Web Praegnanz",
                       "Scouter 255 Windows 2Denker", "Scouter 255 iOS 2Denker"],
        "S_ServiceCard": ["Stuttgart Service Card"],
        "einfachMobil": ["einfachMobil Android", "einfachMobil iPhone"],
    }
    channelToGroup = {
        channel: group
        for group, channels in groupToChannels.items()
        for channel in channels
    }
    group = channelToGroup.get(str(row["TECHNICAL_INCOME_CHANNEL"]), "UNKNOWN_TICHG")
    return group.upper()
Vorgehensweise:
- Wir wollen auch ein Label für die Buchungen realisieren, um für die Analyse ein einfacheres Mittel zu haben.
- Aus dem Grund macht es Sinn, neben den Tarifen auch Haupttarife zu definieren, die dann in einer Buchung als Label aufgeführt wird.
- Deswegen sind folgende Schritte notwendig
- Erster Schritt ist die Reduzierung der Tarifbezeichnung auf ein Wort (als Name des Haupttarifs)
- Zweiter Schritt ist die Aufnahme in Buchung als zusätzliche Spalte
- Tarif und Haupttarife haben ein festes Label
- Teilen sich denselben Nummernraum als Primärschlüssel
In [88]:
def processCategoryInformations():
    """Load the tariff category data, derive parent categories and write
    the category/parent-category node files plus their relationship file.

    Returns the enriched category DataFrame (used later for joins).
    """
    df_cat = pd.read_csv('./datasets/OPENDATA_CATEGORY_CARSHARING.csv', quotechar='"',encoding ='utf-8', sep=';')
    print('%s Original data of category informations %s' %(seperatingLine, seperatingLine))
    print(df_cat.head())
    # Diagnostic output: which columns contain null values?
    null_columns=df_cat.columns[df_cat.isnull().any()]
    print('%s Column with null values in category informations %s' %(seperatingLine, seperatingLine))
    print(df_cat[null_columns].isnull().sum())
    print('%s Unique values of column CATEGORY %s' %(seperatingLine, seperatingLine))
    print(sorted(df_cat['CATEGORY'].unique()))
    print('%s Unique values of column COMPANY, COMPANY_GROUP %s' %(seperatingLine, seperatingLine))
    print(sorted(df_cat["COMPANY"].unique()))
    print(sorted(df_cat["COMPANY_GROUP"].unique()))
    df_cat['LABEL'] = df_cat.apply(getLabel, axis=1, nodeName='CATEGORY')
    # NOTE(review): the df.apply({'COL': fn}) dict-of-callables form applies
    # fn element-wise to that single column; it is deprecated/removed in
    # newer pandas releases -- confirm against the pandas version in use.
    # The parent category name is the first word of the category name.
    df_cat['PARENT_CATEGORY'] = df_cat.apply({'CATEGORY' : lambda x: x.split()[0].replace('/', '').replace('-', '_')})
    # One-word category names get the suffix " BASE" so category and parent
    # category names stay distinguishable.
    df_cat['CATEGORY'] = df_cat.apply({'CATEGORY' : lambda x: x if (len(x.split())>1) else (str(x) + " BASE")})
    df_cat['PARENT_LABEL'] = df_cat.apply(getLabel, axis=1, nodeName='PARENT_CATEGORY')
    # Parent categories share the category ID namespace, offset by 10000.
    df_cat['PARENT_ID'] = df_cat.apply({'HAL_ID' : lambda x: x + 10000})
    df_cat['REL_TYPE_CAT_PCAT'] = df_cat.apply(getRelationshipType, axis=1, types=["CATEGORY","PARENT_CATEGORY"])
    print('%s Unique values of column CATEGORY %s' %(seperatingLine, seperatingLine))
    print(sorted(df_cat['CATEGORY'].unique()))
    print('%s Unique values of column PARENT_CATEGORY %s' %(seperatingLine, seperatingLine))
    print(sorted(df_cat['PARENT_CATEGORY'].unique()))
    print('%s Category informations extended by a parent category %s' %(seperatingLine, seperatingLine))
    print(df_cat.iloc[:,:])
    # Nodes for the tariff classes
    writeDsvFile( \
        df_cat, 'categories', defaultCsvItemDelimiter, ['HAL_ID', 'CATEGORY', 'LABEL'], \
        ['categoryID:ID(CATEGORY-ID)','name', ':LABEL'])
    # Nodes for the main tariff classes
    writeDsvFile( \
        df_cat, 'parent_categories', defaultCsvItemDelimiter, ['PARENT_ID', 'PARENT_CATEGORY', 'PARENT_LABEL'], \
        ['categoryID:ID(CATEGORY-ID)','name', ':LABEL'])
    # Relationships to the main tariff classes
    writeDsvFile( \
        df_cat, 'rel_cat_pcat', defaultCsvItemDelimiter, ['HAL_ID', 'PARENT_ID', 'REL_TYPE_CAT_PCAT'], \
        [':START_ID(CATEGORY-ID)',':END_ID(CATEGORY-ID)', ':TYPE'])
    return df_cat
In [89]:
# Run the category processing; the enriched frame is reused for later joins.
df_cat = processCategoryInformations()
#df_cat.head()
Die Fahrzeuge haben sehr viele Eigenschaften. Nur einige davon eignen sich auch als Kategorisierungsmerkmal:
- VEHICLE_MODEL_TYPE -> Type
- VEHICLE_MANUFACTURER_NAME -> Marke
- FUEL_TYPE_NAME -> Kraftstoff
Andere Eigenschaften werden als Attribut aufgenommen.
Header für die Fahrzeuginformationen:
vehicleID:ID(VEHICLE-ID)|modelName|modelDetails|vin|registrationPlate|kw:long|fuelType|ownershipType|company|companyGroup|:LABEL
In [90]:
def processVehicleInformations():
    """Load the vehicle data, derive the vehicle label column and write
    the vehicle node file for the Neo4j import.

    Returns the enriched vehicle DataFrame.
    """
    df_fz = pd.read_csv('./datasets/OPENDATA_VEHICLE_CARSHARING.csv', quotechar='"',encoding ='utf-8', sep=';')
    null_columns=df_fz.columns[df_fz.isnull().any()]
    print('%s ID number range in vehicles %s' %(seperatingLine, seperatingLine))
    print(df_fz.VEHICLE_HAL_ID.agg(['min', 'max']))
    print('%s Column with null values in vehicle informations %s' %(seperatingLine, seperatingLine))
    print(df_fz[null_columns].isnull().sum())
    print('%s Unique values of column COMPANY, COMPANY_GROUP %s' %(seperatingLine, seperatingLine))
    print(sorted(df_fz["COMPANY"].unique()))
    print(sorted(df_fz["COMPANY_GROUP"].unique()))
    print('%s Unique values of column VEHICLE_MANUFACTURER_NAME, VEHICLE_MODEL_TYPE and FUEL_TYPE_NAME %s' %(seperatingLine, seperatingLine))
    print(sorted(df_fz['VEHICLE_MANUFACTURER_NAME'].unique()))
    print(sorted(df_fz['VEHICLE_MODEL_TYPE'].unique()))
    print(sorted(df_fz['FUEL_TYPE_NAME'].unique()))
    # Commas inside VEHICLE_TYPE_NAME would clash with the comma-separated
    # export; count them first, then replace them with dots below.
    print('%s Values with comma in column VEHICLE_TYPE_NAME %s' %(seperatingLine, seperatingLine))
    print(len(list(filter( \
        lambda x: "," in x \
        , df_fz['VEHICLE_TYPE_NAME'].unique() \
    ))))
    # NOTE(review): dict-of-callables apply form, deprecated in newer pandas.
    df_fz['VEHICLE_TYPE_NAME'] = df_fz.apply({'VEHICLE_TYPE_NAME' : lambda x: x.replace(',', '.')})
    df_fz['LABEL'] = df_fz.apply(getLabel, axis=1, nodeName='VEHICLE')
    print('%s vehicles dataframe informations %s' %(seperatingLine, seperatingLine))
    print(df_fz.info(null_counts=True))
    # Nodes for the vehicles
    writeDsvFile( \
        df_fz, 'vehicles', defaultCsvItemDelimiter, ['VEHICLE_HAL_ID', 'VEHICLE_MODEL_NAME', \
        'VEHICLE_TYPE_NAME', 'VIN', 'REGISTRATION_PLATE', 'KW', 'FUEL_TYPE_NAME', \
        'OWNERSHIP_TYPE', 'CAPACITY_AMOUNT', 'ACCESS_CONTROL_COMPONENT_TYPE', 'LABEL'], \
        ['vehicleID:ID(VEHICLE-ID)','modelName','modelDetails','vin','registrationPlate',\
        'kw:long','fuelType','ownershipType','capacityAmount', 'bordComputerType', ':LABEL'])
    return df_fz
In [91]:
# Run the vehicle processing; head() shows a sample in the notebook output.
df_fz = processVehicleInformations()
df_fz.head()
Out[91]:
In [92]:
import functools as ft

def processRentalZoneInformations():
    """Load the rental zone (station) data, normalize the coordinate
    columns, derive the station label and write the station node file.

    Returns the enriched rental zone DataFrame.
    """
    #global seperatingLine
    #global defaultCsvItemDelimiter
    df_rz = pd.read_csv('./datasets/OPENDATA_RENTAL_ZONE_CARSHARING.csv', quotechar='"',encoding ='utf-8', sep=';')
    print(df_rz.info(null_counts=True))
    null_columns=df_rz.columns[df_rz.isnull().any()]
    print('%s Null columns in dataframe rental zones %s' %(seperatingLine, seperatingLine))
    print(df_rz[null_columns].isnull().sum())
    print('%s ID number range in rental zones %s' %(seperatingLine, seperatingLine))
    print(df_rz.RENTAL_ZONE_HAL_ID.agg(['min', 'max']))
    print('%s Unique values of column COMPANY, COMPANY_GROUP %s' %(seperatingLine, seperatingLine))
    print(sorted(df_rz["COMPANY"].unique()))
    print(sorted(df_rz["COMPANY_GROUP"].unique()))
    print('%s Unique values of columns RENTAL_ZONE_HAL_SRC and COUNTRY %s' %(seperatingLine, seperatingLine))
    print(df_rz['RENTAL_ZONE_HAL_SRC'].unique())
    print(df_rz['COUNTRY'].unique())
    print(df_rz['ACTIVE_X'].unique())
    # Are there only city names without spaces?
    print('%s Are there only cities with one word? %s' %(seperatingLine, seperatingLine))
    print(ft.reduce( \
        lambda x, y: x & y , \
        map( \
            lambda x: len(x.split())==1 \
            , df_rz['CITY'].unique()) \
        ))
    # Which city names contain spaces?
    print('%s Wich cities have morde then one word? %s' %(seperatingLine, seperatingLine))
    print(list(filter( \
        lambda x: len(x.split())>1 \
        , df_rz['CITY'].unique() \
    )))
    # Which rental zone types exist?
    print('%s Unique values of column TYPE %s' %(seperatingLine, seperatingLine))
    print(df_rz['TYPE'].unique())
    # Data types in rental zones
    print('%s Datatypes of dataframe rental zones %s' %(seperatingLine, seperatingLine))
    print(df_rz.dtypes)
    print('%s Longtitude and Latitude content examples %s' %(seperatingLine, seperatingLine))
    print(df_rz.iloc[:5, 7:9])
    # Normalize the longitude values: decimal comma -> decimal point,
    # then convert the column to float.
    df_rz['LONGITUDE'] = df_rz.apply({'LONGITUDE' : lambda x: str(x).replace(',', '.')})
    df_rz['LONGITUDE'] = df_rz['LONGITUDE'].astype(float)
    # Print the data types again to verify the conversion.
    print('%s Datatypes of dataframe rental zones %s' %(seperatingLine, seperatingLine))
    print(df_rz.dtypes)
    # Normalize the latitude values in the same way.
    df_rz['LATITUDE'] = df_rz.apply({'LATITUDE' : lambda x: str(x).replace(',', '.')})
    df_rz['LATITUDE'] = df_rz['LATITUDE'].astype(float)
    # Print the data types again to verify the conversion.
    print('%s Datatypes of dataframe rental zones %s' %(seperatingLine, seperatingLine))
    print(df_rz.dtypes)
    print('%s Longtitude and Latitude content examples after float conversion %s' %(seperatingLine, seperatingLine))
    print(df_rz.iloc[:5, 7:9])
    # Generate the label information
    df_rz['LABEL'] = df_rz.apply(getLabel, axis=1, nodeName='RENTAL_ZONE')
    print('%s Content of dataframe rental zones with label %s' %(seperatingLine, seperatingLine))
    print(df_rz.head())
    # Write the import data:
    # nodes for the stations
    writeDsvFile(df_rz, 'rentalZones', defaultCsvItemDelimiter, \
        ['RENTAL_ZONE_HAL_ID', 'NAME', 'CODE', 'TYPE', 'CITY', 'LATITUDE', \
        'LONGITUDE', 'POI_AIRPORT_X', 'POI_LONG_DISTANCE_TRAINS_X', 'POI_SUBURBAN_TRAINS_X', \
        'POI_UNDERGROUND_X', 'LABEL']\
        , [ 'rentalZoneID:ID(RENTAL-ZONE-ID)', 'name', 'code', 'type', 'city', 'latitude:float', \
        'longtitude:float', 'poiAirport', 'poiLongDistanceTrains', 'poiSuburbanTrains', \
        'poiUnderground', ':LABEL'])
    return df_rz
In [93]:
# Run the rental zone processing; head() shows a sample in the notebook output.
df_rz = processRentalZoneInformations()
df_rz.head()
Out[93]:
Möglicher Umgang mit den Informationen über die Quelle (Technical income channel) der Bestellung:
Als Node mit weiteren Informationen/Beziehungen:
- Es liegen Detail-Informationen über die Quelle vor und die Informationen sind fachlich in Bezug auf die Haltung der Daten relevant. Notwendige Aufwände:
- Integrieren der Detail-Informationen
- ID-Namespace für die Nodes
- Klärung der notwendige Beziehungen
Als Label in einer Buchung:
- Es liegen keine Detail-Informationen über die Quelle vor. Notwendige Aufwände:
- Die Quellbenennungen Label-Fähig machen (Reduktion auf ein Wort)
- Die Labelnamen als zusätzliche Spalte in df_booking aufnehmen.
In [94]:
def processBookingInformations():
    """Load the booking data and enrich it with derived columns: income
    channel type/group, normalized boolean flags and a distance category.

    Returns the enriched booking DataFrame.
    """
    df_booking = pd.read_csv('./datasets/OPENDATA_BOOKING_CARSHARING.csv', quotechar='"',encoding ='utf-8', sep=';')
    print('%s Compact info about dataframe bookings %s' %(seperatingLine, seperatingLine))
    print(df_booking.info(null_counts=True))
    print('%s Original content of dataframe bookings %s' %(seperatingLine, seperatingLine))
    print(df_booking.head())
    null_columns=df_booking.columns[df_booking.isnull().any()]
    print('%s Null columns in dataframe bookings %s' %(seperatingLine, seperatingLine))
    print(df_booking[null_columns].isnull().sum())
    # Missing distances become 0; missing income channels become 'UNDEFINED'.
    df_booking[['DISTANCE']] = df_booking[['DISTANCE']].fillna(value=0)
    df_booking[['TECHNICAL_INCOME_CHANNEL']] = df_booking[['TECHNICAL_INCOME_CHANNEL']].fillna('UNDEFINED', axis=1)
    print('%s Modified technical income channel informations about bookings %s' %(seperatingLine, seperatingLine))
    print(sorted(df_booking.TECHNICAL_INCOME_CHANNEL.unique()))
    df_booking['INCOME_CHANNEL_TYPE'] = df_booking.apply(getTechnicalIncomeChannelType, axis=1)
    print('%s Booking informations extended by technical income channel type %s' %(seperatingLine, seperatingLine))
    print(df_booking.iloc[:5, 7:])
    df_booking['INCOME_CHANNEL_GROUP'] = df_booking.apply(getTechnicalIncomeChannelGroup, axis=1)
    print('%s Booking informations extended by technical income channel group %s' %(seperatingLine, seperatingLine))
    print(df_booking.iloc[:5, 7:])
    print('%s Unique values in column INCOME_CHANNEL_GROUP %s' %(seperatingLine, seperatingLine))
    print(df_booking.INCOME_CHANNEL_GROUP.unique())
    print('%s Unique values in column INCOME_CHANNEL_TYPE %s' %(seperatingLine, seperatingLine))
    print(df_booking.INCOME_CHANNEL_TYPE.unique())
    print('%s Unique values in columns DISTANCE, COMPUTE_EXTRA_BOOKING_FEE and TRAVERSE_USE in bookings %s' %(seperatingLine, seperatingLine))
    print(df_booking.DISTANCE.unique())
    print(sorted(df_booking.COMPUTE_EXTRA_BOOKING_FEE.unique()))
    print(sorted(df_booking.TRAVERSE_USE.unique()))
    # Normalize the German yes/no flags ('Ja'/'Nein') to 'true'/'false'
    # strings so Neo4j can import them as booleans.
    df_booking['TRAVERSE_USE'] = df_booking.apply({'TRAVERSE_USE' : lambda x: "true" if str(x).upper() == "JA" else "false"})
    df_booking['COMPUTE_EXTRA_BOOKING_FEE'] = df_booking.apply({'COMPUTE_EXTRA_BOOKING_FEE' : lambda x: "true" if str(x).upper() == "JA" else "false"})
    print('%s Modified boolean informations in bookings %s' %(seperatingLine, seperatingLine))
    print(df_booking.iloc[:5, 7:])
    df_booking['DISTANCE_CATEGORY'] = df_booking.apply(getDistanceCategory, axis=1)
    print('%s Booking informations extended by a distance category %s' %(seperatingLine, seperatingLine))
    print(df_booking.iloc[:5, 7:])
    return df_booking
In [95]:
def processBookingRelationships(df_booking, df_fz, df_cat, df_rz):
    """Join bookings with vehicles, rental zones and categories, report
    data-quality problems (dangling references) and write the
    relationship .dsv files: booking->vehicle, booking->category and the
    aggregated vehicle->rental-zone relationship.

    Returns the booking DataFrame joined with all reference data.
    """
    # Left join bookings with the vehicle manufacturer so bookings that
    # reference a non-existing vehicle show up as NaN manufacturers.
    df_lj_booking_vehicle = pd.merge(\
        df_booking, \
        df_fz.get(['VEHICLE_MANUFACTURER_NAME', 'VEHICLE_HAL_ID']).copy(True), \
        on='VEHICLE_HAL_ID', how='left')
    null_columns=df_lj_booking_vehicle.columns[df_lj_booking_vehicle.isnull().any()]
    print('%s Null columns in joined dataframe bookings/vehicles %s' %(seperatingLine, seperatingLine))
    print(df_lj_booking_vehicle[null_columns].isnull().sum())
    print('%s Unique values in column VEHICLE_MANUFACTURER_NAME in vehicles %s' %(seperatingLine, seperatingLine))
    print(sorted(df_fz['VEHICLE_MANUFACTURER_NAME'].unique()))
    print('%s List of corrupt vehicle ids in bookings %s' %(seperatingLine, seperatingLine))
    print(sorted(df_lj_booking_vehicle[df_lj_booking_vehicle["VEHICLE_MANUFACTURER_NAME"].isnull()] \
        ['VEHICLE_HAL_ID'].unique()))
    nullObjectsInfo = str(len(df_lj_booking_vehicle[df_lj_booking_vehicle["VEHICLE_MANUFACTURER_NAME"].isnull()]))
    print('%s number of bookings with missing vehicle reference: %s %s ' %(seperatingLine, nullObjectsInfo, seperatingLine))
    nullObjectsInfo = str(len(df_lj_booking_vehicle[df_lj_booking_vehicle["VEHICLE_MANUFACTURER_NAME"].notnull()]))
    print('%s number of bookings with correct vehicle reference: %s %s ' %(seperatingLine, nullObjectsInfo, seperatingLine))
    # Missing manufacturers are marked with the sentinel 'UNKNOWN_VM'.
    # NOTE(review): dict-of-callables apply form, deprecated in newer pandas.
    df_lj_booking_vehicle['VEHICLE_MANUFACTURER_NAME'] = df_lj_booking_vehicle.apply( \
        {'VEHICLE_MANUFACTURER_NAME' : lambda x: "UNKNOWN_VM" if str(x) == 'nan' else str(x).upper()})
    df_lj_booking_vehicle.head()
    print('%s Modified vehicle informations in bookings %s' %(seperatingLine, seperatingLine))
    print(df_lj_booking_vehicle.iloc[:5, 3:])
    df_lj_booking_vehicle['REL_TYPE_BOOKING_VEHICLE'] = df_lj_booking_vehicle.apply( \
        getRelationshipType, axis=1, types=["BOOKING","VEHICLE"])
    print('%s Vehicle informations extended by relation label %s' %(seperatingLine, seperatingLine))
    print(df_lj_booking_vehicle.iloc[:5, 3:])
    # Join the start and end rental zone; the _LEFT/_RIGHT suffixes refer
    # to the start and end rental zone id columns respectively.
    df_lj_booking_rz = pd.merge(\
        pd.merge(\
            df_lj_booking_vehicle, \
            df_rz.get(['RENTAL_ZONE_HAL_ID']).copy(True), \
            left_on='START_RENTAL_ZONE_HAL_ID', right_on='RENTAL_ZONE_HAL_ID', how='left'),\
        df_rz.get(['RENTAL_ZONE_HAL_ID']).copy(True), \
        left_on='END_RENTAL_ZONE_HAL_ID', right_on='RENTAL_ZONE_HAL_ID', how='left', \
        suffixes=('_LEFT', '_RIGHT'))
    null_columns=df_lj_booking_rz.columns[df_lj_booking_rz.isnull().any()]
    print('%s Null columns in joined dataframe bookings/rental zones %s' %(seperatingLine, seperatingLine))
    print(df_lj_booking_rz[null_columns].isnull().sum())
    # Join the category (tariff class) information.
    df_lj_booking_cat = pd.merge(\
        df_lj_booking_rz, \
        df_cat.get(['PARENT_CATEGORY', 'CATEGORY', 'HAL_ID']).copy(True), \
        left_on='CATEGORY_HAL_ID', right_on='HAL_ID', how='left')
    null_columns=df_lj_booking_cat.columns[df_lj_booking_cat.isnull().any()]
    print('%s Null columns in joined dataframe bookings/categories %s' %(seperatingLine, seperatingLine))
    print(df_lj_booking_cat[null_columns].isnull().sum())
    # Missing categories are marked with sentinels as well.
    df_lj_booking_cat['PARENT_CATEGORY'] = df_lj_booking_cat.apply( \
        {'PARENT_CATEGORY' : lambda x: "UNKNOWN_PCAT" if str(x) == 'nan' else str(x).upper()})
    df_lj_booking_cat['CATEGORY'] = df_lj_booking_cat.apply( \
        {'CATEGORY' : lambda x: "UNKNOWN_CAT" if str(x) == 'nan' else str(x).upper()})
    print('%s Modified category informations in joined dataframe bookings/categories %s' \
        %(seperatingLine, seperatingLine))
    print(df_lj_booking_cat[df_lj_booking_cat["PARENT_CATEGORY"] == 'UNKNOWN_PCAT'].iloc[:7, 10:])
    parentCategoryIdList = sorted(df_lj_booking_cat[df_lj_booking_cat["PARENT_CATEGORY"] == 'UNKNOWN_PCAT']\
        ['CATEGORY_HAL_ID'].unique())
    corruptCategoryInfo = str(len(parentCategoryIdList))
    print('%s number of missing category ids: %s %s' %(seperatingLine, corruptCategoryInfo, seperatingLine))
    categoryIdList = sorted(df_lj_booking_cat[df_lj_booking_cat["CATEGORY"] == 'UNKNOWN_CAT']\
        ['CATEGORY_HAL_ID'].unique())
    corruptCategoryInfo = str(len(categoryIdList))
    print('%s number of missing category ids: %s %s' %(seperatingLine, corruptCategoryInfo, seperatingLine))
    corruptCategoryInfo = str(len(df_lj_booking_cat[df_lj_booking_cat["CATEGORY"] == 'UNKNOWN_CAT']))
    print('%s number of bookings with missing (parent) category reference: %s %s' \
        %(seperatingLine, corruptCategoryInfo, seperatingLine))
    # Cross-check: do the dangling category ids really not exist in df_cat?
    print('%s check of missing category ids:' %(seperatingLine))
    for cid in categoryIdList:
        print("id dont exists: " if df_cat[df_cat["HAL_ID"] == cid].empty else df_cat[df_cat["HAL_ID"] == cid], \
            end=' ')
        print(cid)
    print(seperatingLine)
    df_lj_booking_cat['REL_TYPE_BOOKING_CATEGORY'] = df_lj_booking_cat.apply(\
        getRelationshipType, axis=1, types=["BOOKING", "CATEGORY"])
    print('%s Booking informations extended by relationship-label booking/category %s' \
        %(seperatingLine, seperatingLine))
    print(df_lj_booking_cat.iloc[:5, 3:])
    #df_lj_booking_cat['REL_TYPE_VEHICLE_START_RENTAL_ZONE'] = df_lj_booking_cat.apply( \
    #    getRelationshipType, axis=1, types=["VEHICLE", "START_RENTAL_ZONE"])
    #print('%s Booking informations extended by relationship-label vehicle/start-rental-zone %s' \
    #    %(seperatingLine, seperatingLine))
    #print(df_lj_booking_cat.iloc[:5, 3:])
    print('%s compare both dataframes (original bookings and joined) %s' %(seperatingLine, seperatingLine))
    print(df_booking.info(null_counts=True))
    print(df_lj_booking_cat.info(null_counts=True))
    # The null references must be filtered out first; only then is the
    # count of (vehicle, start zone, end zone) pairs reliable.
    df_gb_vrz = df_lj_booking_cat[ \
        (df_lj_booking_cat.RENTAL_ZONE_HAL_ID_LEFT.notnull()) & \
        (df_lj_booking_cat.RENTAL_ZONE_HAL_ID_RIGHT.notnull()) & \
        (df_lj_booking_cat.VEHICLE_MANUFACTURER_NAME != "UNKNOWN_VM")] \
        .get(['VEHICLE_HAL_ID', 'RENTAL_ZONE_HAL_ID_LEFT', 'RENTAL_ZONE_HAL_ID_RIGHT', \
        'VEHICLE_MANUFACTURER_NAME']).copy(True) \
        .groupby(['VEHICLE_HAL_ID', 'RENTAL_ZONE_HAL_ID_LEFT', 'RENTAL_ZONE_HAL_ID_RIGHT']).size() \
        .reset_index().rename(columns={0:'count'})
    # The zone ids became float through the left join (NaN support);
    # convert them back to int for the export.
    df_gb_vrz['RENTAL_ZONE_HAL_ID_LEFT']=df_gb_vrz['RENTAL_ZONE_HAL_ID_LEFT'].astype(int)
    df_gb_vrz['RENTAL_ZONE_HAL_ID_RIGHT']=df_gb_vrz['RENTAL_ZONE_HAL_ID_RIGHT'].astype(int)
    print('%s vehicle/start-and end-rental-zone pairs with count %s' \
        %(seperatingLine, seperatingLine))
    print(df_gb_vrz)
    print('%s number of vehicle/start-and end-rental-zone pairs with different start-/ and end-rental-zone %s' \
        %(seperatingLine, seperatingLine))
    print(df_gb_vrz[df_gb_vrz.RENTAL_ZONE_HAL_ID_LEFT != df_gb_vrz.RENTAL_ZONE_HAL_ID_RIGHT])
    df_gb_vrz_counts = df_gb_vrz.groupby(['VEHICLE_HAL_ID']).size() \
        .reset_index().rename(columns={0:'countOfRentalZones'})
    print('%s count of different vehicle/rental-zone pairs per vehicle %s' \
        %(seperatingLine, seperatingLine))
    print(df_gb_vrz_counts.sort_values('countOfRentalZones'))
    print('%s single rental-zone for a vehicle is the standard case %s' \
        %(seperatingLine, seperatingLine))
    print(df_gb_vrz_counts.countOfRentalZones.agg(['min', 'max', 'median']))
    print('%s distribution of the diversity rate by vehicle/rental-zone pairs %s' \
        %(seperatingLine, seperatingLine))
    df_gb_vrz_diversity = df_gb_vrz_counts.groupby(['countOfRentalZones']).size() \
        .reset_index().rename(columns={0:'occurrence'})
    print(df_gb_vrz_diversity.sort_values('occurrence',ascending=False))
    df_gb_vrz['REL_TYPE_VEHICLE_RENTAL_ZONE'] = df_gb_vrz.apply( \
        getRelationshipType, axis=1, types=["VEHICLE", "RENTAL_ZONE"])
    print('%s Booking informations extended by relationship-label vehicle/rental-zone %s' \
        %(seperatingLine, seperatingLine))
    print(df_gb_vrz.iloc[:5, 3:])
    # Relationships to the vehicles.
    # Rows with dangling references are deliberately excluded.
    writeDsvFile(df_lj_booking_cat[df_lj_booking_cat["VEHICLE_MANUFACTURER_NAME"] != "UNKNOWN_VM"], \
        'rel_booking_vehicle', defaultCsvItemDelimiter, \
        ['BOOKING_HAL_ID', 'VEHICLE_HAL_ID', 'REL_TYPE_BOOKING_VEHICLE'], \
        [':START_ID(BOOKING-ID)',':END_ID(VEHICLE-ID)', ':TYPE'])
    # Relationships to the tariff classes.
    # Rows with dangling references are deliberately excluded.
    writeDsvFile(df_lj_booking_cat[df_lj_booking_cat["CATEGORY"] != 'UNKNOWN_CAT'], \
        'rel_booking_category', defaultCsvItemDelimiter, \
        ['BOOKING_HAL_ID', 'CATEGORY_HAL_ID', 'REL_TYPE_BOOKING_CATEGORY'],\
        [':START_ID(BOOKING-ID)',':END_ID(CATEGORY-ID)', ':TYPE'])
    # Relationship vehicle -> start rental zone (kept for reference):
    # dangling references would be excluded here for both entities,
    # rental zone and vehicle.
    #writeDsvFile(df_lj_booking_cat[ \
    #    (df_lj_booking_cat.RENTAL_ZONE_HAL_ID_LEFT.notnull()) & \
    #    (df_lj_booking_cat.VEHICLE_MANUFACTURER_NAME != "UNKNOWN")], \
    #    'rel_vehicle_start_rental_zone', defaultCsvItemDelimiter, \
    #    ['VEHICLE_HAL_ID', 'DATE_FROM', 'START_RENTAL_ZONE_HAL_ID', 'REL_TYPE_VEHICLE_START_RENTAL_ZONE'], \
    #    [':START_ID(VEHICLE-ID)', 'at', ':END_ID(RENTAL-ZONE-ID)', ':TYPE'])
    # Relationship vehicle -> end rental zone (aggregated with a usage
    # count); dangling references were excluded when df_gb_vrz was built.
    writeDsvFile(df_gb_vrz, \
        'rel_vehicle_rental_zone', defaultCsvItemDelimiter, \
        ['VEHICLE_HAL_ID', 'count', 'RENTAL_ZONE_HAL_ID_RIGHT', 'REL_TYPE_VEHICLE_RENTAL_ZONE'], \
        [':START_ID(VEHICLE-ID)', 'times', ':END_ID(RENTAL-ZONE-ID)', ':TYPE'])
    return df_lj_booking_cat
In [96]:
def processBookingLabelAndDsv(df_booking_final):
    """Derive the booking label, drop the helper columns that only feed
    the label and write the booking node file for the Neo4j import.
    """
    df_booking_final['LABEL'] = df_booking_final.apply(getLabel, axis=1, nodeName='BOOKING')
    print('%s Content of dataframe bookings with label %s' %(seperatingLine, seperatingLine))
    print(df_booking_final.iloc[:5, 10:])
    # These columns are encoded in the LABEL column and are therefore not
    # exported as separate attributes.
    df_booking_final = df_booking_final.drop(['INCOME_CHANNEL_TYPE', 'INCOME_CHANNEL_GROUP', \
        'DISTANCE_CATEGORY', 'PARENT_CATEGORY', 'VEHICLE_MANUFACTURER_NAME'], axis=1)
    print('%s Content of dataframe bookings only with attribute and label-information %s' \
        %(seperatingLine, seperatingLine))
    print(df_booking_final.iloc[:5, :])
    # Nodes for the bookings
    writeDsvFile(df_booking_final, 'bookings', ',', ['BOOKING_HAL_ID', 'DATE_BOOKING', \
        'DATE_FROM', 'DATE_UNTIL', 'COMPUTE_EXTRA_BOOKING_FEE', \
        'TRAVERSE_USE', 'DISTANCE', 'START_RENTAL_ZONE', 'END_RENTAL_ZONE', \
        'CITY_RENTAL_ZONE', 'TECHNICAL_INCOME_CHANNEL', 'LABEL'], \
        ['bookingID:ID(BOOKING-ID)','bookingDate','startDate','endDate',\
        'computeExtraBookingFee:boolean','traverseUse:boolean',\
        'distance:float','startRentalZone','endRentalZone','cityRentalZone',\
        'technicalIncomeChannel',':LABEL'])
In [97]:
# Load and enrich the raw booking data.
df_booking = processBookingInformations()
In [98]:
# Keep the attribute/label columns of the bookings in a separate frame ...
df_booking_info_cols = df_booking.get(['BOOKING_HAL_ID', 'DATE_BOOKING', 'COMPUTE_EXTRA_BOOKING_FEE', \
    'DATE_FROM', 'DATE_UNTIL', \
    'TRAVERSE_USE', 'DISTANCE', 'START_RENTAL_ZONE', 'END_RENTAL_ZONE', \
    'CITY_RENTAL_ZONE', 'TECHNICAL_INCOME_CHANNEL', 'INCOME_CHANNEL_TYPE', 'INCOME_CHANNEL_GROUP', \
    'DISTANCE_CATEGORY']).copy(True)
# ... and drop them from the working copy that is joined with the
# reference data (essentially only the join-key columns remain).
df_booking_modified = df_booking.drop(['DATE_BOOKING', 'COMPUTE_EXTRA_BOOKING_FEE', \
    'TRAVERSE_USE', 'DISTANCE', 'START_RENTAL_ZONE', 'END_RENTAL_ZONE', \
    'CITY_RENTAL_ZONE', 'TECHNICAL_INCOME_CHANNEL', 'INCOME_CHANNEL_TYPE', 'INCOME_CHANNEL_GROUP', \
    'DISTANCE_CATEGORY'], axis=1)
df_booking_modified = processBookingRelationships(df_booking_modified, df_fz, df_cat, df_rz)
In [99]:
# Merge the label-relevant join results (parent category, manufacturer)
# back onto the booking attribute columns, then write the booking nodes.
df_final_booking_infos = pd.merge(\
    df_booking_info_cols, \
    df_booking_modified.get(['BOOKING_HAL_ID', 'PARENT_CATEGORY', \
        'VEHICLE_MANUFACTURER_NAME']).copy(True), \
    on='BOOKING_HAL_ID', how='left')
print('%s compare both dataframes (original bookings and joined) %s' %(seperatingLine, seperatingLine))
print(df_booking.info(null_counts=True))
print(df_final_booking_infos.info(null_counts=True))
processBookingLabelAndDsv(df_final_booking_infos)
In [100]:
#df_gb_vsrz = df_booking_modified \
#    .get(['VEHICLE_HAL_ID', 'START_RENTAL_ZONE_HAL_ID', 'END_RENTAL_ZONE_HAL_ID']).copy(True) \
#    .groupby(['VEHICLE_HAL_ID', 'START_RENTAL_ZONE_HAL_ID', 'END_RENTAL_ZONE_HAL_ID']).size() \
#    .reset_index().rename(columns={0:'count'})
#df_gb_vsrz
#df_gb_vsrz.sort_values(1, axis=1)
#df_gb_vsrz.agg(['min', 'max'])
# Interactive re-check of the vehicle/rental-zone aggregation: count the
# bookings per (vehicle, start zone, end zone) triple, after filtering
# out dangling vehicle and rental zone references.
df_gb_vsrz = df_booking_modified[ \
    (df_booking_modified.RENTAL_ZONE_HAL_ID_LEFT.notnull()) & \
    (df_booking_modified.RENTAL_ZONE_HAL_ID_RIGHT.notnull()) & \
    (df_booking_modified.VEHICLE_MANUFACTURER_NAME != "UNKNOWN_VM")] \
    .get(['VEHICLE_HAL_ID', 'RENTAL_ZONE_HAL_ID_LEFT', 'RENTAL_ZONE_HAL_ID_RIGHT', \
    'VEHICLE_MANUFACTURER_NAME']).copy(True) \
    .groupby(['VEHICLE_HAL_ID', 'RENTAL_ZONE_HAL_ID_LEFT', 'RENTAL_ZONE_HAL_ID_RIGHT']).size() \
    .reset_index().rename(columns={0:'count'})
df_gb_vsrz
Out[100]:
In [101]:
# Show the triples whose start and end rental zone differ.
df_gb_vsrz[df_gb_vsrz.RENTAL_ZONE_HAL_ID_LEFT != df_gb_vsrz.RENTAL_ZONE_HAL_ID_RIGHT]
Out[101]:
In [102]:
# How many distinct rental-zone pairs does each vehicle have?
df_gb_vsrz2 = df_gb_vsrz.groupby(['VEHICLE_HAL_ID']).size() \
    .reset_index().rename(columns={0:'countOfRentalZones'})
df_gb_vsrz2.sort_values('countOfRentalZones')
Out[102]:
In [103]:
# Spread of the per-vehicle rental-zone counts.
df_gb_vsrz2.countOfRentalZones.agg(['min', 'max', 'median'])
Out[103]:
In [104]:
# Distribution: how often does each rental-zone diversity value occur?
df_gb_vsrz3 = df_gb_vsrz2.groupby(['countOfRentalZones']).size() \
    .reset_index().rename(columns={0:'weightOfDiversity'})
df_gb_vsrz3.sort_values('weightOfDiversity',ascending=False)
Out[104]:
Log-Zeile in Neo4J:
InputRelationship:
source: /Users/ilker/Workspaces/jupyter/DB_OpenData_To_Neo4J/output/rel_booking_vehicle.dsv:547814
startNode: 46343998 (BOOKING-ID)
endNode: 172291 (VEHICLE-ID)
type: REFERS_TO
referring to missing node 46343998
Das bedeutet, dass die Booking-ID 46343998 nicht verfügbar ist. Bei einer solchen Meldung muss das fehlende Element also nicht das referenzierte sein. Beide IDs sind aber in den Ursprungs-DataFrames verfügbar:
In [105]:
#list(filter( \
# lambda x: x==172291 \
# , df_fz['VEHICLE_HAL_ID'] \
#))
In [106]:
#list(filter( \
# lambda x: x==46343998 \
# , df_booking['BOOKING_HAL_ID'] \
#))
In [107]:
#list(filter( \
# lambda x: x==46343998 \
# , df_booking_labels['BOOKING_HAL_ID'] \
#))
Die Spalte VEHICLE_HAL_ID ist auch dann gefüllt, wenn es keine entsprechende Zeile in der DataFrame df_fz_labels gibt. Das führt dazu, dass über die andere Spalte VEHICLE_MANUFACTURER_NAME festgestellt werden muss, ob es "Nicht Übereinstimmungen" gab.
Die Filterung im Folgenden erbringt die Info, dass 1608 Buchungen auf ein nicht existierendes Fahrzeug zeigen:
Rustams-MBP:scripts ilker$ awk -f extractBadFindingsCompact.awk /Applications/neo4j-community-3.2.3/import.report > 20170827_badFindings.json
Rustams-MBP:scripts ilker$ ls -la
total 19344
drwxr-xr-x 5 ilker staff 170 27 Aug 21:03 .
drwxr-xr-x 8 ilker staff 272 27 Aug 19:23 ..
-rw-r--r-- 1 ilker staff 9895205 27 Aug 21:03 20170827_badFindings.json
-rwxrwxrwx@ 1 ilker staff 1631 27 Aug 21:02 extractBadFindingsCompact.awk
-rw-r--r-- 1 ilker staff 1835 27 Aug 19:25 loadDataToNeo4J.sh
Rustams-MBP:scripts ilker$ ls
20170827_badFindings.json extractBadFindingsCompact.awk loadDataToNeo4J.sh
Rustams-MBP:scripts ilker$ cp /Users/ilker/Downloads/extractBrokenTableNamespaces.awk .
Rustams-MBP:scripts ilker$ chmod 777 extractBrokenTableNamespaces.awk
Rustams-MBP:scripts ilker$ vim extractBrokenTableNamespaces.awk
Rustams-MBP:scripts ilker$ vim extractBadFindingsCompact.awk
Rustams-MBP:scripts ilker$ vim extractBadFindingsCompact.awk
Rustams-MBP:scripts ilker$ grep "Broken-Link" 20170827_badFindings.json | wc -l
73641
Rustams-MBP:scripts ilker$ vim extractBadFindingsCompact.awk
Rustams-MBP:scripts ilker$ grep "Duplicate-ID" 20170827_badFindings.json | wc -l
0
Rustams-MBP:scripts ilker$ vim extractBadFindingsCompact.awk
Rustams-MBP:scripts ilker$ grep "RENTAL-ZONE-ID" 20170827_badFindings.json | wc -l
66942
Rustams-MBP:scripts ilker$ grep "ID-NS" 20170827_badFindings.json | sort | uniq
, "ID-NS": "(RENTAL-ZONE-ID)"
, "ID-NS": "(VEHICLE-ID)"
Rustams-MBP:scripts ilker$ grep "VEHICLE-ID" 20170827_badFindings.json | wc -l
6699
Rustams-MBP:scripts ilker$
In [ ]: