ProgrammingAssignmentWeek_01



In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
# Parse the pipe-delimited check-ins dump into `data`: a list of rows, each row
# being a list of whitespace-stripped string tokens. The first row is the header.
data = []
# `with` guarantees the file handle is closed, even if parsing fails part-way
# (the bare open() used previously leaked the handle).
with open(r'./umn_foursquare_datasets/checkins.dat') as checkins_file:
    for line in tqdm(checkins_file):
        data.append([token.strip() for token in line.split('|')])


1021967it [00:03, 300031.17it/s]

In [3]:
# The first parsed row holds the column names (id, user_id, venue_id, ...), so
# use it as the header instead of storing it as a data record; otherwise the
# frame gets numeric column labels and a bogus first row.
data_frame = pd.DataFrame(data[1:], columns=data[0])
# index=False keeps the positional index out of the file, so reading the CSV
# back with header=0 yields exactly the original named columns.
data_frame.to_csv(r'./umn_foursquare_datasets/checkins.csv', index=False)

In [27]:
# Reload the check-ins from the CSV written earlier; the first line of the
# file is taken as the column header.
data_frame = pd.read_csv(
    filepath_or_buffer=r'./umn_foursquare_datasets/checkins.csv',
    header=0,
)
# Show the first ten rows as a quick sanity check.
data_frame.head(n=10)


Out[27]:
id user_id venue_id latitude longitude created_at
1 984301 2041916 5222 NaN NaN 2012-04-21 17:39:01
2 984222 15824 5222 38.895112 -77.036366 2012-04-21 17:43:47
3 984315 1764391 5222 NaN NaN 2012-04-21 17:37:18
4 984234 44652 5222 33.800745 -84.410520 2012-04-21 17:43:43
5 984249 2146840 5222 NaN NaN 2012-04-21 17:42:58
6 984268 2146843 5222 NaN NaN 2012-04-21 17:42:38
7 984281 2146846 5222 NaN NaN 2012-04-21 17:39:40
8 984291 105054 5222 45.523452 -122.676207 2012-04-21 17:39:22
9 6651 1338710 219703 NaN NaN 2011-12-08 23:11:23
10 984318 2146539 5222 40.764462 -111.904565 2012-04-21 17:35:46

In [28]:
# Peek at the first ten parsed rows (row 0 is the header) to sanity-check the split.
data[:10]


Out[28]:
[['id', 'user_id', 'venue_id', 'latitude', 'longitude', 'created_at'],
 ['984301', '2041916', '5222', '', '', '2012-04-21 17:39:01'],
 ['984222',
  '15824',
  '5222',
  '38.8951118',
  '-77.0363658',
  '2012-04-21 17:43:47'],
 ['984315', '1764391', '5222', '', '', '2012-04-21 17:37:18'],
 ['984234', '44652', '5222', '33.800745', '-84.41052', '2012-04-21 17:43:43'],
 ['984249', '2146840', '5222', '', '', '2012-04-21 17:42:58'],
 ['984268', '2146843', '5222', '', '', '2012-04-21 17:42:38'],
 ['984281', '2146846', '5222', '', '', '2012-04-21 17:39:40'],
 ['984291',
  '105054',
  '5222',
  '45.5234515',
  '-122.6762071',
  '2012-04-21 17:39:22'],
 ['6651', '1338710', '219703', '', '', '2011-12-08 23:11:23']]

In [29]:
# Number of rows in the frame loaded from the CSV.
len(data_frame)


Out[29]:
1021966

In [30]:
# Display the frame; pandas elides the middle rows in the rendered output.
data_frame


Out[30]:
id user_id venue_id latitude longitude created_at
1 984301 2041916 5222 NaN NaN 2012-04-21 17:39:01
2 984222 15824 5222 38.895112 -77.036366 2012-04-21 17:43:47
3 984315 1764391 5222 NaN NaN 2012-04-21 17:37:18
4 984234 44652 5222 33.800745 -84.410520 2012-04-21 17:43:43
5 984249 2146840 5222 NaN NaN 2012-04-21 17:42:58
6 984268 2146843 5222 NaN NaN 2012-04-21 17:42:38
7 984281 2146846 5222 NaN NaN 2012-04-21 17:39:40
8 984291 105054 5222 45.523452 -122.676207 2012-04-21 17:39:22
9 6651 1338710 219703 NaN NaN 2011-12-08 23:11:23
10 984318 2146539 5222 40.764462 -111.904565 2012-04-21 17:35:46
11 984232 93870 380645 33.448377 -112.074037 2012-04-21 17:38:18
12 984483 1030290 955969 32.221743 -110.926479 2012-04-21 17:58:54
13 984685 304253 23558 40.650000 -73.950000 2012-04-21 18:19:34
14 984470 720850 749715 33.448377 -112.074037 2012-04-21 17:02:47
15 984500 54536 63452 NaN NaN 2012-04-21 18:07:26
16 984610 1639666 442605 33.414768 -111.909309 2012-04-21 18:04:58
17 984722 1566751 23558 NaN NaN 2012-04-21 18:14:26
18 10222 1340753 331466 NaN NaN 2011-12-09 00:55:22
19 984653 1647192 23558 42.358431 -71.059773 2012-04-21 18:23:22
20 984251 298547 77014 33.448377 -112.074037 2012-04-21 17:34:33
21 984528 2046311 15682 33.414768 -111.909309 2012-04-21 18:18:29
22 984736 2146942 23558 NaN NaN 2012-04-21 18:04:53
23 984545 2070173 15682 NaN NaN 2012-04-21 16:36:56
24 984237 2146838 18553 NaN NaN 2012-04-21 14:40:30
25 984257 423903 66777 NaN NaN 2012-04-21 17:59:32
26 984438 2096701 4432 33.448377 -112.074037 2012-04-21 18:21:19
27 984277 1648816 18006 33.248664 -111.634299 2012-04-21 17:49:30
28 984320 349414 819 32.840678 -117.258794 2012-04-21 17:27:36
29 984343 2146861 819 NaN NaN 2012-04-21 17:26:26
30 984359 1861538 819 34.052234 -118.243685 2012-04-21 17:18:22
... ... ... ... ... ... ...
1021937 949275 872108 610705 33.103174 -96.670550 2012-04-13 00:55:03
1021938 949643 1903801 16093 NaN NaN 2012-04-12 23:26:03
1021939 949650 1794344 83070 NaN NaN 2012-04-13 01:14:48
1021940 949841 281860 9822 28.538335 -81.379237 2012-04-13 01:32:08
1021941 951412 919527 112490 47.606209 -122.332071 2012-04-13 01:29:20
1021942 951656 1358854 60 NaN NaN 2012-04-13 03:20:09
1021943 951707 2137641 112006 NaN NaN 2012-04-13 03:24:42
1021944 952256 1028835 8595 42.129224 -80.085059 2012-04-13 02:42:32
1021945 952750 962762 60 NaN NaN 2012-04-13 04:33:23
1021946 953075 1834102 46717 NaN NaN 2012-04-13 03:12:07
1021947 953225 2137860 490458 NaN NaN 2012-04-13 02:56:48
1021948 953584 174305 916378 28.320007 -80.607551 2012-04-13 05:04:27
1021949 953690 1367118 4202 NaN NaN 2012-04-13 03:55:50
1021950 953695 146164 4202 30.267153 -97.743061 2012-04-13 03:33:53
1021951 953907 1903379 61002 NaN NaN 2012-04-13 02:56:59
1021952 953968 2137969 515136 NaN NaN 2012-04-13 05:01:20
1021953 954162 285893 42311 33.448377 -112.074037 2012-04-13 03:28:53
1021954 954283 709109 684720 33.448377 -112.074037 2012-04-13 06:26:18
1021955 954361 81625 5222 37.629349 -122.400087 2012-04-13 06:27:38
1021956 954428 626076 950644 40.850100 -73.866246 2012-04-13 06:29:25
1021957 954536 1903801 60 NaN NaN 2012-04-13 06:47:49
1021958 954925 674797 7491 33.748995 -84.387982 2012-04-13 05:26:29
1021959 955280 1903801 44209 NaN NaN 2012-04-13 08:16:55
1021960 955561 626076 20073 40.850100 -73.866246 2012-04-13 09:56:48
1021961 955892 674797 2297 33.748995 -84.387982 2012-04-13 10:56:03
1021962 956377 845102 11195 42.765366 -71.467566 2012-04-13 12:08:45
1021963 956119 1139114 29488 42.439479 -83.743830 2012-04-13 11:36:44
1021964 956447 2088020 4432 NaN NaN 2012-04-13 12:58:05
1021965 956733 960666 60 42.331427 -83.045754 2012-04-13 21:56:19
1021966 957139 1771518 10935 NaN NaN 2012-04-14 02:44:52

1021966 rows × 6 columns


In [31]:
# Keep only check-ins that carry both coordinates, since those are the
# clustering features. Limiting the NaN check to the coordinate columns avoids
# discarding a row because of a gap in a column that is removed afterwards
# anyway; rebinding instead of inplace=True keeps the step explicit.
data_frame = data_frame.dropna(subset=['latitude', 'longitude'])
# Display the filtered frame (pandas elides the middle rows).
data_frame


Out[31]:
id user_id venue_id latitude longitude created_at
2 984222 15824 5222 38.895112 -77.036366 2012-04-21 17:43:47
4 984234 44652 5222 33.800745 -84.410520 2012-04-21 17:43:43
8 984291 105054 5222 45.523452 -122.676207 2012-04-21 17:39:22
10 984318 2146539 5222 40.764462 -111.904565 2012-04-21 17:35:46
11 984232 93870 380645 33.448377 -112.074037 2012-04-21 17:38:18
12 984483 1030290 955969 32.221743 -110.926479 2012-04-21 17:58:54
13 984685 304253 23558 40.650000 -73.950000 2012-04-21 18:19:34
14 984470 720850 749715 33.448377 -112.074037 2012-04-21 17:02:47
16 984610 1639666 442605 33.414768 -111.909309 2012-04-21 18:04:58
19 984653 1647192 23558 42.358431 -71.059773 2012-04-21 18:23:22
20 984251 298547 77014 33.448377 -112.074037 2012-04-21 17:34:33
21 984528 2046311 15682 33.414768 -111.909309 2012-04-21 18:18:29
26 984438 2096701 4432 33.448377 -112.074037 2012-04-21 18:21:19
27 984277 1648816 18006 33.248664 -111.634299 2012-04-21 17:49:30
28 984320 349414 819 32.840678 -117.258794 2012-04-21 17:27:36
30 984359 1861538 819 34.052234 -118.243685 2012-04-21 17:18:22
35 984294 1447044 1135957 33.414768 -111.909309 2012-04-21 17:33:12
36 984336 2064557 111071 33.448377 -112.074037 2012-04-21 16:25:00
39 984527 1243334 989443 34.483901 -114.322455 2012-04-21 18:22:59
41 984326 2046338 989443 34.048928 -111.093731 2012-04-21 17:56:27
42 984454 2098254 610703 32.715329 -117.157255 2012-04-21 17:49:30
43 984395 1508584 410624 33.448377 -112.074037 2012-04-21 17:58:10
47 984231 1868751 21948 38.895112 -77.036366 2012-04-21 15:53:38
48 984322 1096457 10935 33.745851 -117.826166 2012-04-21 18:10:08
50 984474 2146888 10935 34.096111 -118.105833 2012-04-21 17:39:56
51 984355 966186 10935 34.052234 -118.243685 2012-04-21 18:03:47
53 984345 482032 10935 37.629349 -122.400087 2012-04-21 18:04:38
54 984508 2146895 10935 4.598056 -74.075833 2012-04-21 17:38:25
55 984381 28655 10935 33.804167 -118.158056 2012-04-21 17:59:40
56 984346 2098173 11195 40.735657 -74.172367 2012-04-21 18:17:31
... ... ... ... ... ... ...
1021886 936189 610763 7491 41.499495 -81.695409 2012-04-11 05:46:31
1021888 936836 610763 64 41.499495 -81.695409 2012-04-11 11:19:55
1021891 937236 287341 9310 30.267153 -97.743061 2012-04-11 13:04:51
1021896 938393 379311 11138 39.739154 -104.984703 2012-04-11 17:12:13
1021897 938627 503977 6562 28.538335 -81.379237 2012-04-11 18:09:54
1021899 938806 287341 12004 30.267153 -97.743061 2012-04-11 17:17:49
1021902 940180 1028835 16436 42.129224 -80.085059 2012-04-11 21:58:20
1021903 940219 373497 7489 40.715972 -74.001437 2012-04-11 22:14:12
1021913 943071 404031 1095987 33.448377 -112.074037 2012-04-12 09:25:48
1021916 943920 706143 4432 35.960638 -83.920739 2012-04-12 16:53:29
1021917 944155 872108 54467 33.103174 -96.670550 2012-04-12 19:06:15
1021919 944731 502383 23558 41.650111 -70.241131 2012-04-12 20:34:27
1021924 946195 354912 29137 39.867891 -75.131565 2012-04-12 20:34:29
1021926 946253 281860 29488 28.538335 -81.379237 2012-04-12 21:50:52
1021937 949275 872108 610705 33.103174 -96.670550 2012-04-13 00:55:03
1021940 949841 281860 9822 28.538335 -81.379237 2012-04-13 01:32:08
1021941 951412 919527 112490 47.606209 -122.332071 2012-04-13 01:29:20
1021944 952256 1028835 8595 42.129224 -80.085059 2012-04-13 02:42:32
1021948 953584 174305 916378 28.320007 -80.607551 2012-04-13 05:04:27
1021950 953695 146164 4202 30.267153 -97.743061 2012-04-13 03:33:53
1021953 954162 285893 42311 33.448377 -112.074037 2012-04-13 03:28:53
1021954 954283 709109 684720 33.448377 -112.074037 2012-04-13 06:26:18
1021955 954361 81625 5222 37.629349 -122.400087 2012-04-13 06:27:38
1021956 954428 626076 950644 40.850100 -73.866246 2012-04-13 06:29:25
1021958 954925 674797 7491 33.748995 -84.387982 2012-04-13 05:26:29
1021960 955561 626076 20073 40.850100 -73.866246 2012-04-13 09:56:48
1021961 955892 674797 2297 33.748995 -84.387982 2012-04-13 10:56:03
1021962 956377 845102 11195 42.765366 -71.467566 2012-04-13 12:08:45
1021963 956119 1139114 29488 42.439479 -83.743830 2012-04-13 11:36:44
1021965 956733 960666 60 42.331427 -83.045754 2012-04-13 21:56:19

396634 rows × 6 columns


In [32]:
# Keep only the two coordinate columns used as clustering features.
# Selecting them by name (instead of dropping the others in place) makes the
# cell safe to re-run -- a second in-place drop raises KeyError because the
# columns are already gone -- and pins the column order to
# (latitude, longitude), which is the order in which cluster centres are later
# indexed as [0] and [1].
data_frame = data_frame[['latitude', 'longitude']]

In [33]:
from sklearn.cluster import MeanShift
# Mean-shift clusterer with a fixed kernel bandwidth.
# NOTE(review): the bandwidth is in the units of the feature columns; with the
# latitude/longitude frame fitted below that presumably means 0.1 degrees -- confirm.
clusterAlg = MeanShift(bandwidth=0.1)

In [34]:
# Fit the clusterer on the first 100000 rows only.
# NOTE(review): per the scikit-learn API, fit() returns the fitted estimator, so
# `clasters` holds the estimator here rather than labels; it is overwritten with
# the label array by the later predict() cell.
clasters = clusterAlg.fit(data_frame[:100000])

In [49]:
# Assign a cluster label to each of the same 100000 rows used for fitting;
# the result is an integer array with one label per row.
clasters = clusterAlg.predict(data_frame[:100000])
# Display the label array.
clasters


Out[49]:
array([ 5,  7, 30, ..., 25, 19,  4])

In [50]:
# Tally how many rows were assigned to each cluster label.
# Maps label -> number of occurrences, in order of first appearance.
cluster_elem_numb = {}
for cluster in clasters:
    cluster_elem_numb[cluster] = cluster_elem_numb.get(cluster, 0) + 1

In [51]:
# Reference office locations, one (latitude, longitude) pair per office.
# The pair order matches the cluster centres they are compared against later:
# element [0] is compared with centre[0], element [1] with centre[1].
# NOTE(review): values look like decimal degrees -- confirm against the task statement.
offices = [
    (33.751277, -118.188740),
    (25.867736, -80.324116),
    (51.503016, -0.075479),
    (52.378894, 4.885084),
    (39.366487, 117.036146),
    (-33.868457, 151.205134),
]

In [48]:
# Inspect the fitted cluster centres: a 2-column array, one row per cluster,
# in the same column order as the fitted frame.
clusterAlg.cluster_centers_


Out[48]:
array([[  40.7177164 ,  -73.99183542],
       [  33.44943805, -112.00213969],
       [  33.44638027, -111.90188756],
       ..., 
       [  38.891565  , -121.2930079 ],
       [  42.5953378 ,  -78.9411461 ],
       [  41.5822716 ,  -85.8344383 ]])

In [56]:
# Find the sufficiently populated cluster whose centre lies closest to any of
# the offices. Distance is the plain Euclidean distance between coordinate
# pairs (i.e. measured in coordinate units, not a geodesic distance).
MIN_CLUSTER_SIZE = 15  # clusters with fewer assigned rows than this are skipped

best_coor = None     # centre of the best cluster so far; stays None if none qualifies
best_dist = np.inf   # distance from that centre to its nearest office

for cluster_idx, curr_coor in enumerate(tqdm(clusterAlg.cluster_centers_)):
    # .get(..., 0): a centre that received no rows has no entry in the counts
    # mapping; treat it as an empty cluster instead of failing with KeyError.
    if cluster_elem_numb.get(cluster_idx, 0) < MIN_CLUSTER_SIZE:
        continue

    for coordinate in offices:
        curr_dist = ((curr_coor[0] - coordinate[0]) ** 2 +
                     (curr_coor[1] - coordinate[1]) ** 2) ** 0.5

        # Strict '<' keeps the first centre found on ties.
        if curr_dist < best_dist:
            best_dist = curr_dist
            best_coor = curr_coor


100%|██████████| 3230/3230 [00:00<00:00, 468321.42it/s]

In [57]:
# Show the two coordinates of the winning cluster centre.
# NOTE(review): best_coor is still None if no cluster passed the size filter,
# in which case this indexing raises TypeError.
best_coor[0], best_coor[1]


Out[57]:
(-33.860630428571433, 151.20477592857145)

In [ ]: