In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
In [2]:
data = []
for line in tqdm(open(r'./umn_foursquare_datasets/checkins.dat')):
line_data = line.split('|')
line_data = [token.strip() for token in line_data]
data.append(line_data)
1021967it [00:03, 300031.17it/s]
In [3]:
data_frame = pd.DataFrame(data)
data_frame.to_csv(r'./umn_foursquare_datasets/checkins.csv')
In [27]:
data_frame = pd.read_csv(r'./umn_foursquare_datasets/checkins.csv', header=0)
data_frame.head(10)
Out[27]:
id
user_id
venue_id
latitude
longitude
created_at
1
984301
2041916
5222
NaN
NaN
2012-04-21 17:39:01
2
984222
15824
5222
38.895112
-77.036366
2012-04-21 17:43:47
3
984315
1764391
5222
NaN
NaN
2012-04-21 17:37:18
4
984234
44652
5222
33.800745
-84.410520
2012-04-21 17:43:43
5
984249
2146840
5222
NaN
NaN
2012-04-21 17:42:58
6
984268
2146843
5222
NaN
NaN
2012-04-21 17:42:38
7
984281
2146846
5222
NaN
NaN
2012-04-21 17:39:40
8
984291
105054
5222
45.523452
-122.676207
2012-04-21 17:39:22
9
6651
1338710
219703
NaN
NaN
2011-12-08 23:11:23
10
984318
2146539
5222
40.764462
-111.904565
2012-04-21 17:35:46
In [28]:
data[:10]
Out[28]:
[['id', 'user_id', 'venue_id', 'latitude', 'longitude', 'created_at'],
['984301', '2041916', '5222', '', '', '2012-04-21 17:39:01'],
['984222',
'15824',
'5222',
'38.8951118',
'-77.0363658',
'2012-04-21 17:43:47'],
['984315', '1764391', '5222', '', '', '2012-04-21 17:37:18'],
['984234', '44652', '5222', '33.800745', '-84.41052', '2012-04-21 17:43:43'],
['984249', '2146840', '5222', '', '', '2012-04-21 17:42:58'],
['984268', '2146843', '5222', '', '', '2012-04-21 17:42:38'],
['984281', '2146846', '5222', '', '', '2012-04-21 17:39:40'],
['984291',
'105054',
'5222',
'45.5234515',
'-122.6762071',
'2012-04-21 17:39:22'],
['6651', '1338710', '219703', '', '', '2011-12-08 23:11:23']]
In [29]:
len(data_frame)
Out[29]:
1021966
In [30]:
data_frame
Out[30]:
id
user_id
venue_id
latitude
longitude
created_at
1
984301
2041916
5222
NaN
NaN
2012-04-21 17:39:01
2
984222
15824
5222
38.895112
-77.036366
2012-04-21 17:43:47
3
984315
1764391
5222
NaN
NaN
2012-04-21 17:37:18
4
984234
44652
5222
33.800745
-84.410520
2012-04-21 17:43:43
5
984249
2146840
5222
NaN
NaN
2012-04-21 17:42:58
6
984268
2146843
5222
NaN
NaN
2012-04-21 17:42:38
7
984281
2146846
5222
NaN
NaN
2012-04-21 17:39:40
8
984291
105054
5222
45.523452
-122.676207
2012-04-21 17:39:22
9
6651
1338710
219703
NaN
NaN
2011-12-08 23:11:23
10
984318
2146539
5222
40.764462
-111.904565
2012-04-21 17:35:46
11
984232
93870
380645
33.448377
-112.074037
2012-04-21 17:38:18
12
984483
1030290
955969
32.221743
-110.926479
2012-04-21 17:58:54
13
984685
304253
23558
40.650000
-73.950000
2012-04-21 18:19:34
14
984470
720850
749715
33.448377
-112.074037
2012-04-21 17:02:47
15
984500
54536
63452
NaN
NaN
2012-04-21 18:07:26
16
984610
1639666
442605
33.414768
-111.909309
2012-04-21 18:04:58
17
984722
1566751
23558
NaN
NaN
2012-04-21 18:14:26
18
10222
1340753
331466
NaN
NaN
2011-12-09 00:55:22
19
984653
1647192
23558
42.358431
-71.059773
2012-04-21 18:23:22
20
984251
298547
77014
33.448377
-112.074037
2012-04-21 17:34:33
21
984528
2046311
15682
33.414768
-111.909309
2012-04-21 18:18:29
22
984736
2146942
23558
NaN
NaN
2012-04-21 18:04:53
23
984545
2070173
15682
NaN
NaN
2012-04-21 16:36:56
24
984237
2146838
18553
NaN
NaN
2012-04-21 14:40:30
25
984257
423903
66777
NaN
NaN
2012-04-21 17:59:32
26
984438
2096701
4432
33.448377
-112.074037
2012-04-21 18:21:19
27
984277
1648816
18006
33.248664
-111.634299
2012-04-21 17:49:30
28
984320
349414
819
32.840678
-117.258794
2012-04-21 17:27:36
29
984343
2146861
819
NaN
NaN
2012-04-21 17:26:26
30
984359
1861538
819
34.052234
-118.243685
2012-04-21 17:18:22
...
...
...
...
...
...
...
1021937
949275
872108
610705
33.103174
-96.670550
2012-04-13 00:55:03
1021938
949643
1903801
16093
NaN
NaN
2012-04-12 23:26:03
1021939
949650
1794344
83070
NaN
NaN
2012-04-13 01:14:48
1021940
949841
281860
9822
28.538335
-81.379237
2012-04-13 01:32:08
1021941
951412
919527
112490
47.606209
-122.332071
2012-04-13 01:29:20
1021942
951656
1358854
60
NaN
NaN
2012-04-13 03:20:09
1021943
951707
2137641
112006
NaN
NaN
2012-04-13 03:24:42
1021944
952256
1028835
8595
42.129224
-80.085059
2012-04-13 02:42:32
1021945
952750
962762
60
NaN
NaN
2012-04-13 04:33:23
1021946
953075
1834102
46717
NaN
NaN
2012-04-13 03:12:07
1021947
953225
2137860
490458
NaN
NaN
2012-04-13 02:56:48
1021948
953584
174305
916378
28.320007
-80.607551
2012-04-13 05:04:27
1021949
953690
1367118
4202
NaN
NaN
2012-04-13 03:55:50
1021950
953695
146164
4202
30.267153
-97.743061
2012-04-13 03:33:53
1021951
953907
1903379
61002
NaN
NaN
2012-04-13 02:56:59
1021952
953968
2137969
515136
NaN
NaN
2012-04-13 05:01:20
1021953
954162
285893
42311
33.448377
-112.074037
2012-04-13 03:28:53
1021954
954283
709109
684720
33.448377
-112.074037
2012-04-13 06:26:18
1021955
954361
81625
5222
37.629349
-122.400087
2012-04-13 06:27:38
1021956
954428
626076
950644
40.850100
-73.866246
2012-04-13 06:29:25
1021957
954536
1903801
60
NaN
NaN
2012-04-13 06:47:49
1021958
954925
674797
7491
33.748995
-84.387982
2012-04-13 05:26:29
1021959
955280
1903801
44209
NaN
NaN
2012-04-13 08:16:55
1021960
955561
626076
20073
40.850100
-73.866246
2012-04-13 09:56:48
1021961
955892
674797
2297
33.748995
-84.387982
2012-04-13 10:56:03
1021962
956377
845102
11195
42.765366
-71.467566
2012-04-13 12:08:45
1021963
956119
1139114
29488
42.439479
-83.743830
2012-04-13 11:36:44
1021964
956447
2088020
4432
NaN
NaN
2012-04-13 12:58:05
1021965
956733
960666
60
42.331427
-83.045754
2012-04-13 21:56:19
1021966
957139
1771518
10935
NaN
NaN
2012-04-14 02:44:52
1021966 rows × 6 columns
In [31]:
data_frame.dropna(inplace=True)
data_frame
Out[31]:
id
user_id
venue_id
latitude
longitude
created_at
2
984222
15824
5222
38.895112
-77.036366
2012-04-21 17:43:47
4
984234
44652
5222
33.800745
-84.410520
2012-04-21 17:43:43
8
984291
105054
5222
45.523452
-122.676207
2012-04-21 17:39:22
10
984318
2146539
5222
40.764462
-111.904565
2012-04-21 17:35:46
11
984232
93870
380645
33.448377
-112.074037
2012-04-21 17:38:18
12
984483
1030290
955969
32.221743
-110.926479
2012-04-21 17:58:54
13
984685
304253
23558
40.650000
-73.950000
2012-04-21 18:19:34
14
984470
720850
749715
33.448377
-112.074037
2012-04-21 17:02:47
16
984610
1639666
442605
33.414768
-111.909309
2012-04-21 18:04:58
19
984653
1647192
23558
42.358431
-71.059773
2012-04-21 18:23:22
20
984251
298547
77014
33.448377
-112.074037
2012-04-21 17:34:33
21
984528
2046311
15682
33.414768
-111.909309
2012-04-21 18:18:29
26
984438
2096701
4432
33.448377
-112.074037
2012-04-21 18:21:19
27
984277
1648816
18006
33.248664
-111.634299
2012-04-21 17:49:30
28
984320
349414
819
32.840678
-117.258794
2012-04-21 17:27:36
30
984359
1861538
819
34.052234
-118.243685
2012-04-21 17:18:22
35
984294
1447044
1135957
33.414768
-111.909309
2012-04-21 17:33:12
36
984336
2064557
111071
33.448377
-112.074037
2012-04-21 16:25:00
39
984527
1243334
989443
34.483901
-114.322455
2012-04-21 18:22:59
41
984326
2046338
989443
34.048928
-111.093731
2012-04-21 17:56:27
42
984454
2098254
610703
32.715329
-117.157255
2012-04-21 17:49:30
43
984395
1508584
410624
33.448377
-112.074037
2012-04-21 17:58:10
47
984231
1868751
21948
38.895112
-77.036366
2012-04-21 15:53:38
48
984322
1096457
10935
33.745851
-117.826166
2012-04-21 18:10:08
50
984474
2146888
10935
34.096111
-118.105833
2012-04-21 17:39:56
51
984355
966186
10935
34.052234
-118.243685
2012-04-21 18:03:47
53
984345
482032
10935
37.629349
-122.400087
2012-04-21 18:04:38
54
984508
2146895
10935
4.598056
-74.075833
2012-04-21 17:38:25
55
984381
28655
10935
33.804167
-118.158056
2012-04-21 17:59:40
56
984346
2098173
11195
40.735657
-74.172367
2012-04-21 18:17:31
...
...
...
...
...
...
...
1021886
936189
610763
7491
41.499495
-81.695409
2012-04-11 05:46:31
1021888
936836
610763
64
41.499495
-81.695409
2012-04-11 11:19:55
1021891
937236
287341
9310
30.267153
-97.743061
2012-04-11 13:04:51
1021896
938393
379311
11138
39.739154
-104.984703
2012-04-11 17:12:13
1021897
938627
503977
6562
28.538335
-81.379237
2012-04-11 18:09:54
1021899
938806
287341
12004
30.267153
-97.743061
2012-04-11 17:17:49
1021902
940180
1028835
16436
42.129224
-80.085059
2012-04-11 21:58:20
1021903
940219
373497
7489
40.715972
-74.001437
2012-04-11 22:14:12
1021913
943071
404031
1095987
33.448377
-112.074037
2012-04-12 09:25:48
1021916
943920
706143
4432
35.960638
-83.920739
2012-04-12 16:53:29
1021917
944155
872108
54467
33.103174
-96.670550
2012-04-12 19:06:15
1021919
944731
502383
23558
41.650111
-70.241131
2012-04-12 20:34:27
1021924
946195
354912
29137
39.867891
-75.131565
2012-04-12 20:34:29
1021926
946253
281860
29488
28.538335
-81.379237
2012-04-12 21:50:52
1021937
949275
872108
610705
33.103174
-96.670550
2012-04-13 00:55:03
1021940
949841
281860
9822
28.538335
-81.379237
2012-04-13 01:32:08
1021941
951412
919527
112490
47.606209
-122.332071
2012-04-13 01:29:20
1021944
952256
1028835
8595
42.129224
-80.085059
2012-04-13 02:42:32
1021948
953584
174305
916378
28.320007
-80.607551
2012-04-13 05:04:27
1021950
953695
146164
4202
30.267153
-97.743061
2012-04-13 03:33:53
1021953
954162
285893
42311
33.448377
-112.074037
2012-04-13 03:28:53
1021954
954283
709109
684720
33.448377
-112.074037
2012-04-13 06:26:18
1021955
954361
81625
5222
37.629349
-122.400087
2012-04-13 06:27:38
1021956
954428
626076
950644
40.850100
-73.866246
2012-04-13 06:29:25
1021958
954925
674797
7491
33.748995
-84.387982
2012-04-13 05:26:29
1021960
955561
626076
20073
40.850100
-73.866246
2012-04-13 09:56:48
1021961
955892
674797
2297
33.748995
-84.387982
2012-04-13 10:56:03
1021962
956377
845102
11195
42.765366
-71.467566
2012-04-13 12:08:45
1021963
956119
1139114
29488
42.439479
-83.743830
2012-04-13 11:36:44
1021965
956733
960666
60
42.331427
-83.045754
2012-04-13 21:56:19
396634 rows × 6 columns
In [32]:
data_frame.drop(['created_at', 'id', 'user_id', 'venue_id'], axis=1, inplace=True)
In [33]:
from sklearn.cluster import MeanShift
clusterAlg = MeanShift(bandwidth=0.1)
In [34]:
clasters = clusterAlg.fit(data_frame[:100000])
In [49]:
clasters = clusterAlg.predict(data_frame[:100000])
clasters
Out[49]:
array([ 5, 7, 30, ..., 25, 19, 4])
In [50]:
cluster_elem_numb = dict()
for cluster in clasters:
if cluster not in cluster_elem_numb:
cluster_elem_numb[cluster] = 1
else:
cluster_elem_numb[cluster] += 1
In [51]:
offices = [
(33.751277, -118.188740),
(25.867736, -80.324116),
(51.503016, -0.075479),
(52.378894, 4.885084),
(39.366487, 117.036146),
(-33.868457, 151.205134),
]
In [48]:
clusterAlg.cluster_centers_
Out[48]:
array([[ 40.7177164 , -73.99183542],
[ 33.44943805, -112.00213969],
[ 33.44638027, -111.90188756],
...,
[ 38.891565 , -121.2930079 ],
[ 42.5953378 , -78.9411461 ],
[ 41.5822716 , -85.8344383 ]])
In [56]:
best_coor = None
best_dist = np.inf
for cluster_idx in tqdm(range(len(clusterAlg.cluster_centers_))):
if cluster_elem_numb[cluster_idx] < 15:
continue
curr_coor = clusterAlg.cluster_centers_[cluster_idx]
for coordinate in offices:
curr_dist = ((curr_coor[0] - coordinate[0]) ** 2 +
(curr_coor[1] - coordinate[1]) ** 2) ** 0.5
if curr_dist < best_dist:
best_dist = curr_dist
best_coor = curr_coor
100%|██████████| 3230/3230 [00:00<00:00, 468321.42it/s]
In [57]:
best_coor[0], best_coor[1]
Out[57]:
(-33.860630428571433, 151.20477592857145)
In [ ]:
Content source: ALEXKIRNAS/DataScience
Similar notebooks: