In [1]:
from pyspark.sql.functions import col
In [2]:
# Read the parquet dataset into a Spark DataFrame. `sqlContext` is not defined
# in this notebook; presumably the kernel pre-populates it (TODO confirm).
# The file name suggests an iDigBio export stamped 2019-06-12.
df = sqlContext.read.format("parquet").load("/guoda/data/idigbio-20190612T171757.parquet")
In [4]:
# Tally rows for every distinct `country` value that contains "korea",
# most frequent spelling first. Only the aggregated result is pulled into
# pandas, so the notebook renders it as a table.
(
    df
    .where(col("country").like("%korea%"))
    .groupBy("country")
    .count()
    .sort(col("count").desc())
    .toPandas()
)
Out[4]:
country
count
0
south korea
15305
1
korea
10171
2
north korea
2241
3
korea, republic of
826
4
north korea/south korea
463
5
korea democratic republic of
399
6
korea [not specified]
260
7
korea (republic of)
164
8
korea, democratic people's republic of
151
9
north korea / south korea
105
10
korea, democratic peoples republic
101
11
s.korea
92
12
korea, north
84
13
republic of korea
46
14
korea, south
33
15
korea republic of
32
16
coree [korea]
26
17
korea, democratic people’s republic of
18
18
[korea]
15
19
zuid-korea
12
20
korea (democratic people's republic of)
10
21
north north korea / south korea
10
22
n. coree [korea]
6
23
korea septentrionalis
6
24
south korea / north korea
6
25
coree (korea)
5
26
north or south korea
4
27
korea (s)
3
28
korea [defunct]
3
29
s. korea
2
30
corie du nord [north korea]
2
31
"korea"
2
32
n. korea
2
33
japan / korea
2
34
sud korea
2
35
s korea
2
36
coree du nord [north korea]
2
37
korea (north)
2
38
cent. korea
1
39
korea, democratic peoples republic of
1
40
probably korea
1
41
korea?
1
42
south korea; yellow sea
1
43
corea bor. [korea]
1
44
corea [korea]
1
45
nord coree [korea]
1
46
north korea; south korea
1
47
the republic of korea
1
48
s.e.korea
1
49
korea [obsolete]
1
50
southêkorea
1
51
korea sptentrionalis
1
52
japan / south korea
1
53
korea, china
1
54
korea, democratic people's republic of
1
55
republic of korea
1
56
south korean
1
57
south north korea / south korea
1
Content source: bio-guoda/guoda-examples
Similar notebooks: