In [20]:
import pandas as pd

cdr = pd.read_csv("./data/cdr_data.csv", dtype={0:str})
cdr.head()


Out[20]:
In Out Direction CallTimestamp Duration TowerID
0 04638472273 2666307251 Incoming 2010-12-25 07:16:24.736813 0:02:41.741499 0db53dd3-eb9c-4344-abc5-c2d74ebc3eec
1 04638472273 1755442610 Incoming 2010-12-25 21:18:30.053710 0:02:47.108750 aeaf8b43-8034-44fe-833d-31854a75acbf
2 04638472273 5481755331 Incoming 2010-12-25 14:52:42.878016 0:04:35.356341 fadaa83f-6001-45fd-aa4a-17d6c6b7ec00
3 04638472273 1755442610 Incoming 2010-12-25 16:02:09.001913 0:02:23.498499 fadaa83f-6001-45fd-aa4a-17d6c6b7ec00
4 04638472273 2145623768 Incoming 2010-12-25 15:28:35.028554 0:03:54.692497 95d7920d-c3cd-4d20-a568-9a55800dc807

In [21]:
cdr_name = pd.DataFrame(cdr.In.unique(), columns=['number'])
cdr_name


Out[21]:
number
0 04638472273
1 01559410755
2 04931532174
3 02419930464
4 01884182865
5 03688089071
6 04555003213
7 02068627935
8 02894365987
9 08549533077

In [32]:
from faker import Faker



fake = Faker()

data = cdr_name
data['name'] = data.index.map(lambda x : fake.name())
#data['Number'] = data.index.map(lambda x : "0" + str(fake.zipcode() + fake.zipcode()))
data.loc[int(len(data)/2), ['name']] = "John Doe"
data.loc[int(len(data)/2), ['number']] = "4638472273"
#data['number'] = data.number.apply(lambda x : "0" + str(x))
assert 10 == len(data.name.value_counts()), "duplicate name"
#data = data.set_index('number')
data.to_excel("data/phoneowners.xlsx")

In [26]:
data[data['name'] == "John Doe"]


Out[26]:
name number
499 John Doe 04638472273

In [69]:
data.to_excel("data/phoneowners.xlsx", index=None)

In [86]:
cdr = pd.read_csv("data/CDR_original.csv", dtype={0:str,1:str})
cdr.head()


Out[86]:
In Out Direction CallDate CallTime DOW Duration TowerID TowerLat TowerLon
0 04638472273 02666307251 Incoming 2010-12-25 07:16:24.736813 Sat 0:02:41.741499 0db53dd3-eb9c-4344-abc5-c2d74ebc3eec 32.731611 -96.709417
1 04638472273 01755442610 Incoming 2010-12-25 21:18:30.053710 Sat 0:02:47.108750 aeaf8b43-8034-44fe-833d-31854a75acbf 32.731722 -96.709500
2 04638472273 05481755331 Incoming 2010-12-25 14:52:42.878016 Sat 0:04:35.356341 fadaa83f-6001-45fd-aa4a-17d6c6b7ec00 32.899944 -96.910389
3 04638472273 01755442610 Incoming 2010-12-25 16:02:09.001913 Sat 0:02:23.498499 fadaa83f-6001-45fd-aa4a-17d6c6b7ec00 32.899944 -96.910389
4 04638472273 02145623768 Incoming 2010-12-25 15:28:35.028554 Sat 0:03:54.692497 95d7920d-c3cd-4d20-a568-9a55800dc807 32.899944 -96.910389

In [87]:
towers = pd.read_csv("data/CDR_original.csv")[['TowerID', 'TowerLat', 'TowerLon']].drop_duplicates()
towers.to_csv("data/towers.csv", index=None, float_format="%6.6f")
towers.head()


Out[87]:
TowerID TowerLat TowerLon
0 0db53dd3-eb9c-4344-abc5-c2d74ebc3eec 32.731611 -96.709417
1 aeaf8b43-8034-44fe-833d-31854a75acbf 32.731722 -96.709500
2 fadaa83f-6001-45fd-aa4a-17d6c6b7ec00 32.899944 -96.910389
4 95d7920d-c3cd-4d20-a568-9a55800dc807 32.899944 -96.910389
5 95c91e8b-6ff1-4893-9df3-b0342636bd25 32.899944 -96.910389

In [88]:
towers = pd.read_csv("data/CDR_original.csv")[['TowerID', 'TowerLat', 'TowerLon']].drop_duplicates

In [89]:
cdr_data = cdr[['In', 'Out', 'Direction', 'CallDate', 'CallTime', 'Duration', 'TowerID']]
cdr_data['CallTimestamp'] = cdr.CallDate + " " + cdr.CallTime
del(cdr_data['CallDate'])
del(cdr_data['CallTime'])
cdr_data.head()


C:\dev\apps\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
Out[89]:
In Out Direction Duration TowerID CallTimestamp
0 04638472273 02666307251 Incoming 0:02:41.741499 0db53dd3-eb9c-4344-abc5-c2d74ebc3eec 2010-12-25 07:16:24.736813
1 04638472273 01755442610 Incoming 0:02:47.108750 aeaf8b43-8034-44fe-833d-31854a75acbf 2010-12-25 21:18:30.053710
2 04638472273 05481755331 Incoming 0:04:35.356341 fadaa83f-6001-45fd-aa4a-17d6c6b7ec00 2010-12-25 14:52:42.878016
3 04638472273 01755442610 Incoming 0:02:23.498499 fadaa83f-6001-45fd-aa4a-17d6c6b7ec00 2010-12-25 16:02:09.001913
4 04638472273 02145623768 Incoming 0:03:54.692497 95d7920d-c3cd-4d20-a568-9a55800dc807 2010-12-25 15:28:35.028554

In [95]:
cdr_data.to_csv('data/cdr_data.csv', quoting=1, index=None, columns=['In', 'Out', 'Direction', 'CallTimestamp', 'Duration', 'TowerID'])
cdr_data.head()


Out[95]:
In Out Direction Duration TowerID CallTimestamp
0 04638472273 02666307251 Incoming 0:02:41.741499 0db53dd3-eb9c-4344-abc5-c2d74ebc3eec 2010-12-25 07:16:24.736813
1 04638472273 01755442610 Incoming 0:02:47.108750 aeaf8b43-8034-44fe-833d-31854a75acbf 2010-12-25 21:18:30.053710
2 04638472273 05481755331 Incoming 0:04:35.356341 fadaa83f-6001-45fd-aa4a-17d6c6b7ec00 2010-12-25 14:52:42.878016
3 04638472273 01755442610 Incoming 0:02:23.498499 fadaa83f-6001-45fd-aa4a-17d6c6b7ec00 2010-12-25 16:02:09.001913
4 04638472273 02145623768 Incoming 0:03:54.692497 95d7920d-c3cd-4d20-a568-9a55800dc807 2010-12-25 15:28:35.028554

In [92]:
cdr_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53188 entries, 0 to 53187
Data columns (total 6 columns):
In               53188 non-null object
Out              53188 non-null object
Direction        53188 non-null object
Duration         53188 non-null object
TowerID          53188 non-null object
CallTimestamp    53188 non-null object
dtypes: object(6)
memory usage: 2.4+ MB

In [ ]: