In [1]:
import db
from IPython.core.display import display, HTML
import logic
import matplotlib.pyplot as plt
import pandas as pd
from numpy.random import normal
from wordcloud import WordCloud
import nltk
from pprint import pprint
%matplotlib nbagg
This Jupyter Notebook belongs to a data mining project takign place at UEL. The aim of this research is to monitor a selection of dark web resources to gain insights regarding the type of products and services that are sold, and to possibly identify any significant trends.
On the summer of 2015, a web crawling script was run over a dark web hosted e-commerce platform, collecting records of products being sold on the site. The purpose of this notebook is to analyse the data collected.
Figures below describe the schema of the given database
In [2]:
db.pandas_query('SHOW tables;')
Out[2]:
In [3]:
db.pandas_query('DESCRIBE tblProduct
;')
Out[3]:
In [4]:
db.pandas_query('''
SELECT min(time_stamp) as 'Start Date', max(time_stamp) as 'End Date' FROM tblProduct;
''')
Out[4]:
Within this time, the following number of records have been collected:
In [5]:
db.pandas_query('''
SELECT count(*) as 'Total Product Records' FROM tblProduct;
''')
Out[5]:
In [33]:
dataframe = db.pandas_query('''
SELECT B.subcategory_name, count(A.product_name) as 'Total Products'
FROM tblProduct A
LEFT JOIN tblSubCategory B
ON A.subCategory_id = B.subCategory_id
GROUP BY B.subcategory_name
ORDER BY -count(A.product_name)
limit 30;
''',
index_col='subcategory_name')
dataframe.plot(kind='barh', title='Most Popular Categories')
Out[33]:
From the above query we find that 20 out the 30 most common subcategories are drug related. The other main area of research we are interested on is the illegal market of passports and ids. While there seems not to be any trace of those sub categories, we yet have to analyse the "Other" subcategory.
Let's find out what percentage of these listings do actually fall in the drugs category.
In [7]:
comparison_df = db.pandas_query('''
SELECT count(*) as 'drug_listings' FROM tblProduct
WHERE subCategory_id in
(SELECT subCategory_id from tblSubCategory where subCategory_name IN
('2C', '5-MeO', 'Barbiturates', 'Benzos', 'Buprenorphine', 'Cannabis', 'Chemicals', 'Cocaine',
'Codeine', 'Concentrates', 'Containers', 'Dihydrocodeine', 'Dissociatives', 'DMT', 'Doomsday',
'Drugs', 'Ecstasy', 'Edibles', 'Fentanyl', 'Filters', 'Fireworks', 'GBL', 'GHB', 'Grinders',
'Hash', 'Heroin', 'Hydrocodone', 'Hydromorphone', 'Injecting equipment', 'Ketamine', 'LSD', 'MDA',
'MDMA', 'Mephedrone', 'Mescaline', 'Meth', 'Morphine', 'Mushrooms', 'MXE', 'NB', 'Needles', 'Opioids',
'Opium', 'Oxycodone', 'Paper', 'Paraphernalia', 'PCP', 'Pills', 'Pipes', 'Prescription', 'Psychedelics',
'RCs', 'Salvia', 'Scales', 'Science', 'Seeds', 'Shake/trim', 'Smoked', 'Speed', 'Spores', 'Stashes',
'Steroids', 'Stimulants', 'Synthetics', 'Syringes', 'Weed', 'Weight loss')
);
''')
comparison_df['other'] = dataframe.sum()[0] - comparison_df.iloc[0][0]
print (comparison_df)
# Create a figure with a single subplot
f, ax = plt.subplots(1, figsize=(10,5))
bar_width = 1
# positions of the left bar-boundaries
bar_l = [i for i in range(len(comparison_df['drug_listings']))]
# positions of the x-axis ticks (center of the bars as bar labels)
tick_pos = [i+(bar_width/2) for i in bar_l]
# Create the total
totals = [i+j for i,j in zip(comparison_df['drug_listings'], comparison_df['other'])]
# Create the percentage
pre_rel = [i / j * 100 for i,j in zip(comparison_df['drug_listings'], totals)]
mid_rel = [i / j * 100 for i,j in zip(comparison_df['other'], totals)]
# Create a bar chart in position bar_1
ax.bar(bar_l,
pre_rel,
label='Drug Listings',
alpha=0.9,
color='#019600',
width=bar_width,
edgecolor='white'
)
ax.bar(bar_l,
mid_rel,
bottom=pre_rel,
label='Other',
alpha=0.9,
color='#3C5F5A',
width=bar_width,
edgecolor='white'
)
# Set the ticks to be first names
ax.set_ylabel("Percentage")
ax.set_xlabel("")
ax.get_xaxis().set_ticks([])
ax.legend()
# Let the borders of the graphic
plt.xlim([min(tick_pos)-bar_width, max(tick_pos)+bar_width])
plt.ylim(-10, 110)
# shot plot
plt.show()
In [8]:
countries_df = db.pandas_query('''
SELECT count(*) as 'total_products', product_origin FROM tblProduct
WHERE product_origin != ''
GROUP BY product_origin
ORDER BY total_products DESC;
''', index_col='product_origin')
countries_df.plot(kind='barh', title='Total Productes lsted by country of origin')
Out[8]:
In [9]:
countries_df = db.pandas_query('''
SELECT count(*) as 'total_drug_listings', product_origin FROM tblProduct
WHERE product_origin != ''
and subCategory_id in
(SELECT subCategory_id from tblSubCategory where subCategory_name IN
('2C', '5-MeO', 'Barbiturates', 'Benzos', 'Buprenorphine', 'Cannabis', 'Chemicals', 'Cocaine',
'Codeine', 'Concentrates', 'Containers', 'Dihydrocodeine', 'Dissociatives', 'DMT', 'Doomsday',
'Drugs', 'Ecstasy', 'Edibles', 'Fentanyl', 'Filters', 'Fireworks', 'GBL', 'GHB', 'Grinders',
'Hash', 'Heroin', 'Hydrocodone', 'Hydromorphone', 'Injecting equipment', 'Ketamine', 'LSD', 'MDA',
'MDMA', 'Mephedrone', 'Mescaline', 'Meth', 'Morphine', 'Mushrooms', 'MXE', 'NB', 'Needles', 'Opioids',
'Opium', 'Oxycodone', 'Paper', 'Paraphernalia', 'PCP', 'Pills', 'Pipes', 'Prescription', 'Psychedelics',
'RCs', 'Salvia', 'Scales', 'Science', 'Seeds', 'Shake/trim', 'Smoked', 'Speed', 'Spores', 'Stashes',
'Steroids', 'Stimulants', 'Synthetics', 'Syringes', 'Weed', 'Weight loss')
)
GROUP BY product_origin
ORDER BY total_drug_listings DESC
LIMIT 10;
''', index_col='product_origin')
countries_ax = countries_df.plot(kind='barh')
for p in countries_ax.patches:
countries_ax.annotate(
"%.2f" % p.get_width(), (p.get_x() + p.get_width(),
p.get_y()), xytext=(5, 10), textcoords='offset points')
Given the chinese scenario, lets dive a little bit deeper and see what kind of products are attracting more attention in the Chinese Market.
In [37]:
china_df = db.pandas_query('''
SELECT B.subcategory_name, count(A.product_name) as 'Total Products'
FROM tblProduct A
LEFT JOIN tblSubCategory B
ON A.subCategory_id = B.subCategory_id
WHERE product_origin = 'China'
GROUP BY B.subcategory_name
ORDER BY -count(A.product_name)
limit 30;
''',
index_col='subcategory_name')
china_df.plot(kind='barh', title='Most Popular Categories')
Out[37]:
In [11]:
base_drugs_query = '''
SELECT count(*) as 'total_listed', product_origin FROM tblProduct
WHERE subCategory_id in (SELECT subCategory_id FROM tblSubCategory
WHERE subCategory_name = '{}')
AND product_origin != ''
GROUP BY product_origin
ORDER BY total_listed DESC
LIMIT 3;
'''
drugs_list = ['Cocaine', 'Weed', 'MDMA', 'LSD', 'Heroin', 'Steroids', 'Hash']
drug_data = {}
ax = None
for drug in drugs_list:
df = db.pandas_query(base_drugs_query.format(drug), index_col='product_origin')
df.plot.barh(title='{} top providing countries'.format(drug))
In [12]:
dayonday_df = db.pandas_query('''
SELECT count(*) as total_products, time_stamp FROM tblProduct
GROUP BY time_stamp;
''', index_col='time_stamp')
dayonday_drugs_df = db.pandas_query('''
SELECT count(*) as drugs, time_stamp FROM tblProduct
WHERE subCategory_id in
(SELECT subCategory_id from tblSubCategory where subCategory_name IN
('2C', '5-MeO', 'Barbiturates', 'Benzos', 'Buprenorphine', 'Cannabis', 'Chemicals', 'Cocaine',
'Codeine', 'Concentrates', 'Containers', 'Dihydrocodeine', 'Dissociatives', 'DMT', 'Doomsday',
'Drugs', 'Ecstasy', 'Edibles', 'Fentanyl', 'Filters', 'Fireworks', 'GBL', 'GHB', 'Grinders',
'Hash', 'Heroin', 'Hydrocodone', 'Hydromorphone', 'Injecting equipment', 'Ketamine', 'LSD', 'MDA',
'MDMA', 'Mephedrone', 'Mescaline', 'Meth', 'Morphine', 'Mushrooms', 'MXE', 'NB', 'Needles', 'Opioids',
'Opium', 'Oxycodone', 'Paper', 'Paraphernalia', 'PCP', 'Pills', 'Pipes', 'Prescription', 'Psychedelics',
'RCs', 'Salvia', 'Scales', 'Science', 'Seeds', 'Shake/trim', 'Smoked', 'Speed', 'Spores', 'Stashes',
'Steroids', 'Stimulants', 'Synthetics', 'Syringes', 'Weed', 'Weight loss')
)
GROUP BY time_stamp;
''', index_col='time_stamp')
dayonday_df2 = pd.concat([dayonday_df, dayonday_drugs_df], axis='time_stamp', join='inner')
dayonday_df2.plot()
Out[12]:
The currency used in this dark web hosted commerce platform is the Bitcoin. The dataset does contain the bitcoin price at which each listing has been placed, however, Bitcoin exchange rates are very volatile. Therefore, it would be meaningless to analyse the dataset in terms of its bitcoin value. Instead, each of the Bitcoin values have been converted to both GBP and USD as of the market exchange rate as of the day the product was listed on the website.
Our listings product quantites are not suitable for data analysis, as they are cotained within title and description fields which are just composed of unstructured text blocks. In order to extract those quantities we can either manipulate the string fields with regular expressions or use Natural Language proccessing to extract the required values using semantic and grammar machine learning algorithms. There is a powerfull NLP library called NLTK that we will use in our attempt to extract those values. Let's look at the folliwng two title examples extracted from the 'Cocaine' subcategory dataset:
"6 Grams Pure Cocaine (SAFE,ZERO CUTS & PURE+FREE COKE DAILY)"
"SALE!! 28G Cocaine - 92% Purity -Platinum Quality - No Cuts FE LISTING"We start by tokenizing the sentence into words and analyze the word types grammatically
In [16]:
string = "6 Grams Pure Cocaine (SAFE,ZERO CUTS & PURE+FREE COKE DAILY)"
tokenized = nltk.word_tokenize(string)
tagged = nltk.pos_tag(tokenized)
print (tagged)
In [15]:
named_entities = nltk.ne_chunk(tagged)
named_entities
#named_entities.draw()
Out[15]:
As expected, the unsupervised learning algorithm wasn't very accurate and has classified the words 'Grams Pure Cocaine' as a persons name, given that is composed of 3 capitalized words.
Let's now move into the second example.
In [17]:
string = "SALE!! 28G Cocaine - 92% Purity -Platinum Quality - No Cuts FE LISTING"
tokenized = nltk.word_tokenize(string)
tagged2 = nltk.pos_tag(tokenized)
tagged2
Out[17]:
In this particular case our grammar learning has diverted from the previous example as the word tokenizing algorithm has returned the word 28G rather than "28" and "G" as separate words. As the problem has happened at the word tokenization process, let's add a regular expression check to make sure CD and unit are returned separately:
In [20]:
tokenizer_regex = r'(?u)\d+(?:\.\,\d+)?|\w+|%'
tokenized = nltk.regexp_tokenize(string, tokenizer_regex)
tagged3 = nltk.pos_tag(tokenized)
tagged3
Out[20]:
The regular expression based tokenizer has worked better, as it does now effectively split the unit from it's numerical value. The unit though is being classified as a proper noun (NNP) instead of a nount (NN) as the word is capitalized. Finally, we can "chunk" the POS tags using a POS regex match to get those pairs
In [21]:
chunk_exp = r'''amount: {<CD><NN|NNP|NNS|NNPS>}'''
chunk_parser = nltk.RegexpParser(chunk_exp)
chunked = chunk_parser.parse(tagged3)
chunked
for i in chunked.subtrees():
leaves = i.leaves()
if leaves[1][0] == '%':
i._label = 'purity'
chunked
Out[21]:
In [10]:
chunked.draw()
We want now to tune and refine the algorithm to fit our dataset. This code must be available to be used whenever we need to data mine values with a similar structure to the cocaine subset.
The following pair of functions will work as generators that an be applied to a series object:
In [22]:
def amount_series_generator(chunk, unit):
"""
function that yields measurment units from a given chunk
"""
valid_units = logic.VALID_UNITS.get(unit)
for subtree in chunk.subtrees(filter=lambda t: t.label() == 'amount'):
leaves = subtree.leaves()
if leaves[1][0].lower() in valid_units:
return float(leaves[0][0])
#added this line as I found a few listings selling KILOS of cocaine!
elif leaves[1][0].lower() in ['kg', 'kilo', 'kilogram', 'kgram']:
return float(leaves[0][0]) * 1000
#another hack for oz
elif leaves[1][0].lower() in ['oz', 'ounce', 'ounces']:
if logic.NUMBER_STRING.get(leaves[0][0].lower()):
return logic.NUMBER_STRING.get(leaves[0][0].lower()) * 28.35
return float(leaves[0][0]) * 28.35
return None
def unit_series_generator(chunk, unit):
"""
function that yields measurement units from a given chunk
"""
valid_units = logic.VALID_UNITS.get(unit)
if unit == 'g':
valid_units + ['kg', 'kilo', 'kilogram', 'kgram', 'oz', 'ounce', 'ounces']
for subtree in chunk.subtrees(filter=lambda t: t.label() == 'amount'):
leaves = subtree.leaves()
if leaves[1][0].lower() in valid_units:
return unit
return None
The data we want to extract is contained within the product_name field. Then, we need to process the query and then apply the previously described algorithms subsequently
In [24]:
tokenizer_regex = r'(?u)\d+(?:\.\d+)?|\w+|%'
chunk_exp = r'''amount: {<CD><NN|NNP|NNS|NNPS|JJ>}'''
chunk_parser = nltk.RegexpParser(chunk_exp)
cocaine_listings = db.get_products_by_subcategory('Cocaine')
cocaine_listings_names = cocaine_listings['product_name']
cocaine_words = cocaine_listings_names.apply(nltk.regexp_tokenize, args=(tokenizer_regex,))
cocaine_tagged = cocaine_words.apply(nltk.pos_tag)
cocaine_chunked = cocaine_tagged.apply(chunk_parser.parse)
cocaine_quants = cocaine_chunked.apply(amount_series_generator, args='g')
cocaine_units = cocaine_chunked.apply(unit_series_generator, args='g')
cocaine_listings['cocaine_amount'] = cocaine_quants
cocaine_listings['cocaine_units'] = cocaine_units
cocaine_listings[['product_id', 'product_name','GBP', 'cocaine_amount', 'cocaine_units']]
Out[24]:
The overview shows that the algorithms have performed as epected, but we still see some empty values still, mainly due to the lack of information on the product name.
In order to understand the success rate of the algorithms we can do:
In [25]:
failed = cocaine_units.isnull().sum()
print ('{} out of {} units obtained obtained'.format(len(cocaine_units) - failed, len(cocaine_units)))
print ('{}% success rate'.format(100-(failed/len(cocaine_units)*100)))
As we had just a 12% failure, we can handle the Null values normalizing the distribution with the help of the standard deviation.
In [26]:
gbp_grams_df = cocaine_listings[['GBP', 'cocaine_amount']]
#remove the NaN rows to calculate mean
gbp_grams_df = gbp_grams_df[pd.notnull(gbp_grams_df['cocaine_amount'])]
gbp_grams_df['price_per_gram'] = gbp_grams_df['GBP'] / gbp_grams_df['cocaine_amount']
mean = cocaine_mean_price = gbp_grams_df['price_per_gram'].mean()
mean
Out[26]:
And now we can use the value to fill the missing cells in our main dataframe:
In [27]:
cocaine_listings['cocaine_amount'] = cocaine_listings.cocaine_amount.fillna(cocaine_listings['GBP'] / mean)
cocaine_listings['cocaine_amount'] = cocaine_listings['cocaine_amount'].round(2)
cocaine_listings['cocaine_units'] = cocaine_listings.cocaine_units.fillna('g')
cocaine_listings['price_per_gram'] = cocaine_listings['GBP'] / cocaine_listings['cocaine_amount']
cocaine_listings
Out[27]:
After all this data wranglig we can effectively calculate the amount of cocaine being sold on the site as well as further analyse the data set by price and grams.
In [40]:
print ('{} grams on sale for {} GBP'.format(
cocaine_listings['cocaine_amount'].sum(),
round(cocaine_listings['GBP'].sum(), 2)))
print ('Average price: {} GBP per gram'.format(cocaine_listings['price_per_gram'].mean()))
print('Median price: {} GBP per gram'.format(cocaine_listings['price_per_gram'].median()))
#cocaine_listings['price_per_gram'].hist()
cocaine_plot_df = cocaine_listings[cocaine_listings['price_per_gram'] <1000 ]
#cocaine_plot_df['price_per_gram'].hist(x)
import matplotlib.pyplot as plt
plt.hist(cocaine_plot_df['price_per_gram'], bins=10)
plt.title("Price per gram of cocaine Histogram")
plt.xlabel("Price in GBP")
plt.ylabel("listed")
plt.show()
And now we can group our results by country of origin:
In [43]:
cocaine_plot_df.groupby('product_origin')['price_per_gram'].mean().sort_values().plot(
kind='barh',
title='Average price of a gram of cocaine by country of origin')
Out[43]:
In [44]:
cocaine_plot_df.groupby('product_origin')['cocaine_amount'].sum().sort_values().plot(
kind='barh',
title='grams of cocaine in sale by country of origin')
Out[44]:
While dealing with the gaps some outliers where generated, with prices per gram orders of magnitude higher than the mean. After analyzing that sample, we discovered that some of the listings where offereing KILOGRAMS of cocaine! After tunning the NLP algorithm to also understand kilograms (and ounces) the distribution was normalized again. The following code exposes a subset of some of those suprising listings (offering more than 500g):
In [45]:
cocaine_listings[[
'product_id', 'product_name', 'vendor_id' ,
'product_origin', 'GBP',
'cocaine_amount', 'cocaine_units']
][cocaine_listings['cocaine_amount'] > 500]
Out[45]:
HIGHLIGHTS
With the bitcoin values added to both GBP and USD, we can rank each of the drugs in total value, which gives a better picture of the size of the market than just the activity measured by the number of products listed in the site.
In [46]:
print ('Total Market Value: {} USD'.format(db.query('''SELECT SUM(USD) from tblProduct''').fetchone()[0]))
In [47]:
all_products_df = db.pandas_query('''
SELECT product_id, GBP, B.subCategory_name FROM tblProduct
JOIN tblSubCategory B
ON B.subCategory_id = tblProduct.subCategory_id;
''')
all_products_df.groupby('subCategory_name')['GBP'].sum()\
.sort_values(ascending=False)\
.head(15)\
.plot(kind='barh', title = 'Top 15 categories market value in GBP',
x='Total GBP', y='Category')
Out[47]:
Up until now we have looked at the category as one single group in order to build a general picture of the market in terms of its size, its value, grographic distributions and time series.
In addition, we have also dug deeper onto the drug markets in a number of ways and described in much more detail one of the most popular areas of this particular market: The cocaine market
In this section we are going to focus instead on the different fake and illegal documents that are available for sale in this website. More recently, this topic has been covered on the media, relating to how different interantional terrorist groups have gained access to illegal documents that allow them to travel through ultiple countries using this fake documents.
We have seen records of passports on the database but we are not sure under which categories are they being labeled. Let's have a look:
In [48]:
passport_categories = db.pandas_query('''
SELECT count(A.subCategory_id) as total_passports, B.subCategory_name FROM tblProduct A
JOIN tblSubCategory B ON B.subCategory_id = A.subcategory_id
WHERE A.product_name LIKE '%passport%'
GROUP BY B.subcategory_name
ORDER BY total_passports DESC;
''')
passport_categories
Out[48]:
Unfortunately there isn't a large dataset of passport items but still worth making an exploratory analysis.
After manually revising the databse entries I have learned that those passports under the only two categories containning fake passports are the following:
In [49]:
passport_dataframe = db.pandas_query('''
SELECT A.product_id, A.product_name, A.GBP, B.subCategory_name, A.vendor_id
FROM dark_web.tblProduct A
JOIN tblSubCategory B ON B.subCategory_id = A.subCategory_id
WHERE A.product_name LIKE '%passport%'
AND B.subCategory_name = 'Physical documents'
ORDER BY GBP DESC;
''')
passport_dataframe
Out[49]:
In [50]:
passport_dataframe = db.pandas_query('''
SELECT A.product_id, A.product_name, A.GBP, B.subCategory_name, A.vendor_id
FROM dark_web.tblProduct A
JOIN tblSubCategory B ON B.subCategory_id = A.subCategory_id
WHERE A.product_name LIKE '%passport%'
AND B.subCategory_name = 'Scans/photos'
ORDER BY GBP DESC;
''')
passport_dataframe
Out[50]:
HIGHLIGHTS
more interenstingly, the vendors for this type of documents seem to be very concentrated, which will prove usefull when we do our analysis on the vendors dataset
### 3.2.2 Fake IDs
When manually inspecting the 'Physical documents' dataset I noticed a pattern to classify fake ids:
- Fake IDs: They refer to fake US ID cards, offers include different states within the US
In [51]:
db.pandas_query('''
SELECT A.product_id, A.product_name, A.GBP, B.subCategory_name, A.vendor_id
FROM dark_web.tblProduct A
JOIN tblSubCategory B ON B.subCategory_id = A.subCategory_id
WHERE A.product_name LIKE '%fake id%'
AND B.subCategory_name = 'Physical documents'
ORDER BY GBP DESC;
''')
Out[51]:
In [52]:
db.pandas_query('''
SELECT A.product_id, A.product_name, A.GBP, B.subCategory_name, A.vendor_id
FROM dark_web.tblProduct A
JOIN tblSubCategory B ON B.subCategory_id = A.subCategory_id
WHERE A.product_name LIKE '%national id%'
AND B.subCategory_name = 'Physical documents'
ORDER BY GBP DESC;
''')
Out[52]:
HIGHLIGHTS
In [53]:
id_description = db.query('''
SELECT A.product_description
FROM dark_web.tblProduct A
JOIN tblSubCategory B ON B.subCategory_id = A.subCategory_id
WHERE A.product_name LIKE '%National id%'
AND B.subCategory_name = 'Physical documents'
ORDER BY GBP DESC;
LIMIT 1
''').fetchone()[0]
display(HTML(id_description))
In [54]:
db.pandas_query('''
SELECT A.product_id, A.product_name, A.GBP, B.subCategory_name, A.vendor_id
FROM dark_web.tblProduct A
JOIN tblSubCategory B ON B.subCategory_id = A.subCategory_id
WHERE B.subCategory_name = 'Physical documents'
AND (A.product_name LIKE '%drivers license%'
OR A.product_name LIKE '%driving licence%'
OR A.product_name LIKE '%drivers licence%')
ORDER BY GBP DESC;
''')
Out[54]:
HIGHLIGHTS
In [56]:
display(HTML(db.query('''
SELECT product_description FROM tblProduct WHERE product_id = 12541;''').fetchone()[0]))
In [4]:
count_of_documents = db.query('''
SELECT count(*)
FROM dark_web.tblProduct A
JOIN tblSubCategory B ON B.subCategory_id = A.subCategory_id
WHERE B.subCategory_name = 'Physical documents';
''').fetchone()[0]
print ('{} total document listings'.format(count_of_documents))
For the purpose of analysing the reminder of the document listings, and with the idea of applying this to other categories, the following code builds on the algorithms previously build to do the Natural Language processing, this time focusing on extracting the nouns:
In [5]:
def nlp_noun_extractor(series):
'''Given a Series object of strings, applies a tagging
algorithm to obtain the POS value of each word in the
Series, returning a new series with the broken down strings'''
tagged_series = series_tagger(series)
noun_series = tagged_series.apply(noun_tag_extractor)
words = [word for l in noun_series for word in l if word.isalpha()]
words_freqs = {}
for word in words:
if words_freqs.get(word):
words_freqs[word] += 1
else :
words_freqs[word] = 1
return words_freqs
def noun_tag_extractor(tag_list):
'''Given a list of words and POS tags, returns a list
containing just the NN tags'''
return [word for word,pos in tag_list if pos.startswith('NN')]
def series_tagger(series):
'''Given a Series object S of type strings, returns a Series of
all the tokenized strings from S'''
tokenized_series = series.apply(nltk.word_tokenize)
tagged_series = tokenized_series.apply(nltk.pos_tag)
return tagged_series
In [6]:
fake_docs = db.pandas_query('''
SELECT A.product_id, A.product_name, A.GBP, B.subCategory_name, A.vendor_id
FROM dark_web.tblProduct A
JOIN tblSubCategory B ON B.subCategory_id = A.subCategory_id
WHERE B.subCategory_name = 'Physical documents'
AND (A.product_name LIKE '%drivers license%'
OR A.product_name LIKE '%driving licence%'
OR A.product_name LIKE '%drivers licence%'
OR A.product_name LIKE '%passport%'
OR A.product_name LIKE '%fake id%'
OR A.product_name LIKE '%resident id%'
OR A.product_name LIKE '%passport%')
ORDER BY GBP DESC;
''')
other_docs = db.pandas_query('''
SELECT A.product_id, A.product_name, A.GBP, A.product_description, B.subCategory_name, A.vendor_id
FROM dark_web.tblProduct A
JOIN tblSubCategory B ON B.subCategory_id = A.subCategory_id
WHERE B.subCategory_name = 'Physical documents'
AND product_id not in {};
'''.format(tuple(i for i in fake_docs['product_id'])))
Now we can build a nice word cloud from all those words
In [7]:
words_freqs = nlp_noun_extractor(other_docs['product_name'])
wordcloud = WordCloud().generate_from_frequencies(words_freqs.items())
plt.imshow(wordcloud)
plt.axis("off");
In [61]:
all_product_names = db.pandas_query('''
SELECT product_name from tblProduct;
''')
wf = nlp_noun_extractor(all_product_names['product_name'])
In [62]:
wc = WordCloud().generate_from_frequencies(wf.items())
plt.imshow(wc)
plt.axis('off')
Out[62]:
In [5]:
drugs_ids = db.pandas_query('''
SELECT A.product_id, A.product_name, A.GBP, B.subCategory_name, A.vendor_id FROM tblProduct A
JOIN tblSubCategory B
ON B.subCategory_id = A.subCategory_id
WHERE A.subCategory_id in
(SELECT subCategory_id from tblSubCategory where subCategory_name IN
('2C', '5-MeO', 'Barbiturates', 'Benzos', 'Buprenorphine', 'Cannabis', 'Chemicals', 'Cocaine',
'Codeine', 'Concentrates', 'Containers', 'Dihydrocodeine', 'Dissociatives', 'DMT', 'Doomsday',
'Drugs', 'Ecstasy', 'Edibles', 'Fentanyl', 'Filters', 'Fireworks', 'GBL', 'GHB', 'Grinders',
'Hash', 'Heroin', 'Hydrocodone', 'Hydromorphone', 'Injecting equipment', 'Ketamine', 'LSD', 'MDA',
'MDMA', 'Mephedrone', 'Mescaline', 'Meth', 'Morphine', 'Mushrooms', 'MXE', 'NB', 'Needles', 'Opioids',
'Opium', 'Oxycodone', 'Paper', 'Paraphernalia', 'PCP', 'Pills', 'Pipes', 'Prescription', 'Psychedelics',
'RCs', 'Salvia', 'Scales', 'Science', 'Seeds', 'Shake/trim', 'Smoked', 'Speed', 'Spores', 'Stashes',
'Steroids', 'Stimulants', 'Synthetics', 'Syringes', 'Weed', 'Scans/photos', 'Physical documents')
);
''')
In [6]:
other_categories = db.pandas_query('''
SELECT A.product_id, A.product_name, A.GBP, B.subCategory_name, A.product_origin, A.vendor_id FROM tblProduct A
JOIN tblSubCategory B
ON B.subCategory_id = A.subCategory_id
WHERE A.subCategory_id NOT IN
(SELECT subCategory_id from tblSubCategory where subCategory_name IN
('2C', '5-MeO', 'Barbiturates', 'Benzos', 'Buprenorphine', 'Cannabis', 'Chemicals', 'Cocaine',
'Codeine', 'Concentrates', 'Containers', 'Dihydrocodeine', 'Dissociatives', 'DMT', 'Doomsday',
'Drugs', 'Ecstasy', 'Edibles', 'Fentanyl', 'Filters', 'Fireworks', 'GBL', 'GHB', 'Grinders',
'Hash', 'Heroin', 'Hydrocodone', 'Hydromorphone', 'Injecting equipment', 'Ketamine', 'LSD', 'MDA',
'MDMA', 'Mephedrone', 'Mescaline', 'Meth', 'Morphine', 'Mushrooms', 'MXE', 'NB', 'Needles', 'Opioids',
'Opium', 'Oxycodone', 'Paper', 'Paraphernalia', 'PCP', 'Pills', 'Pipes', 'Prescription', 'Psychedelics',
'RCs', 'Salvia', 'Scales', 'Science', 'Seeds', 'Shake/trim', 'Smoked', 'Speed', 'Spores', 'Stashes',
'Steroids', 'Stimulants', 'Synthetics', 'Syringes', 'Weed', 'Scans/photos', 'Physical documents')
);
'''.format(tuple(set(cat for cat in drugs_ids['subCategory_name']))))
In [7]:
other_categories_df = other_categories[['subCategory_name', 'GBP']]
other_categories\
.groupby('subCategory_name')['GBP']\
.sum()\
.sort_values(ascending=False)\
.plot(kind='barh', title='Other markets by GBP value')
Out[7]:
In [8]:
other_categories\
.groupby('subCategory_name')['GBP']\
.mean()\
.sort_values(ascending=False)\
.plot(kind='barh', title='Other markets average GBP value')
Out[8]:
In [9]:
db.pandas_query('''
SELECT * FROM dark_web.tblProduct
WHERE subCategory_id in
(select subCategory_id from tblSubCategory
where subCategory_name = 'Software')
ORDER BY GBP;
''')
Out[9]:
In [71]:
software_listings_df = db.get_products_by_subcategory('Software')
freqs = nlp_noun_extractor(software_listings_df['product_name'])
wc = WordCloud().generate_from_frequencies(freqs.items())
plt.imshow(wc)
plt.axis("off");
And finally we can use the same algorithm to better understand the category labeled as other th happens to be the mode on the total listings distribution by category
In [72]:
other_listings_df = db.get_products_by_subcategory('Other')
freqs = nlp_noun_extractor(other_listings_df['product_name'])
wc = WordCloud().generate_from_frequencies(freqs.items())
plt.imshow(wc)
plt.axis("off");
This final section covers objective 4 of this project
To gain insight on how agents operate in these markets with appropiate classification and association algorithms
The web scraped datasets contain information about the sup one single website from just the supply side. Therefore our records contain details on the vendors only, and no data has been obtained from the possible buyers.
From the given data, we aim to gain insight on the size of the vendors operating on the site. Example questions include: "How concentrated is the supply within the different vendors? ARe there any existing patterns that would manifest the supply is operated by well-coordinated organizations instead of individuals? How do this patterns manifest within the different categories (focusing on drugs and fake documents distribution)? Do these groups of vendors operate locally or are they rather internationally coordinated, providing products from different locations?
The vendors dataset can be obtained by the following query:
In [10]:
vendors_df = db.pandas_query('''
SELECT * FROM tblVendor;
''')
vendors_df
Out[10]:
The first pattern classification will be to extract the total value of the listings for each of the vendors. With this information we hope to understand how concentrated the supply is. For this purpose, it is required to add an additional column to our dataframe containing the total GBP value of each vendors listings:
In [11]:
vendors_df = db.pandas_query('''
SELECT A.vendor_id, SUM(B.GBP) as GBP_value FROM tblVendor A
JOIN tblProduct B
ON B.vendor_id = A.vendor_id
GROUP BY B.vendor_id;
''', index_col='vendor_id')
vendors_df['GBP_value'].sort_values(ascending=False).head(25).plot(kind='barh')
Out[11]:
In [12]:
import matplotlib.pyplot as plt
plt.hist(vendors_df['GBP_value'], bins=20)
plt.title("Vendors value Histogram")
plt.xlabel("Vendor Value in GBP")
plt.ylabel("Frequency")
plt.show()
The Histogram proves a great disparity between the vendors. The distribution is completely left-skewed, which could potentially mean that while the majoriy of the vendors total products value is below 4 figures, there is a smaller number of vendors whose total value is orders of magnitude higher, with some of them responsible for listings valued over 6 figures.
In order to prove our hypthesis it is required to break the distribution into percentiles and compare the new distribution with the median and the mean
In [84]:
quant_df = pd.DataFrame({'mean': vendors_df.mean(), 'median': vendors_df.median(),
'25%': vendors_df.quantile(0.25), '50%': vendors_df.quantile(0.5),
'75%': vendors_df.quantile(0.90)})
quant_df.plot(kind='bar')
Out[84]:
In [85]:
vendors_df.quantile(.9)
Out[85]:
In [87]:
vendors_df.std()
Out[87]:
As we can see the 75tth percentile is 5 times higher than the 50th, while the mean is 10 times higher than the median, confirmed by a standard deviation of 203.969. These numbers denote how the highest percentiles are responsible to a much larger percentage of the market than all the other vendors combined. Now we will try to understand to what extent, as well as finding what is the sitatuitiont within the main categories object of this analysis: Drugs and Fake documents:
In [88]:
drug_vendors_df = db.pandas_query('''
SELECT A.vendor_id, SUM(B.GBP) as GBP_value FROM tblVendor A
JOIN tblProduct B
ON B.vendor_id = A.vendor_id
WHERE subCategory_id in
(SELECT subCategory_id from tblSubCategory where subCategory_name IN
('2C', '5-MeO', 'Barbiturates', 'Benzos', 'Buprenorphine', 'Cannabis', 'Chemicals', 'Cocaine',
'Codeine', 'Concentrates', 'Containers', 'Dihydrocodeine', 'Dissociatives', 'DMT', 'Doomsday',
'Drugs', 'Ecstasy', 'Edibles', 'Fentanyl', 'Filters', 'Fireworks', 'GBL', 'GHB', 'Grinders',
'Hash', 'Heroin', 'Hydrocodone', 'Hydromorphone', 'Injecting equipment', 'Ketamine', 'LSD', 'MDA',
'MDMA', 'Mephedrone', 'Mescaline', 'Meth', 'Morphine', 'Mushrooms', 'MXE', 'NB', 'Needles', 'Opioids',
'Opium', 'Oxycodone', 'Paper', 'Paraphernalia', 'PCP', 'Pills', 'Pipes', 'Prescription', 'Psychedelics',
'RCs', 'Salvia', 'Scales', 'Science', 'Seeds', 'Shake/trim', 'Smoked', 'Speed', 'Spores', 'Stashes',
'Steroids', 'Stimulants', 'Synthetics', 'Syringes', 'Weed', 'Weight loss')
)
GROUP BY B.vendor_id;
''', index_col='vendor_id')
coke_vendors_df = db.pandas_query('''
SELECT A.vendor_id, SUM(B.GBP) as GBP_value FROM tblVendor A
JOIN tblProduct B
ON B.vendor_id = A.vendor_id
WHERE subCategory_id in
(SELECT subCategory_id from tblSubCategory where subCategory_name IN
('Cocaine')
)
GROUP BY B.vendor_id;
''', index_col='vendor_id')
docs_vendors_df = db.pandas_query('''
SELECT A.vendor_id, SUM(B.GBP) as GBP_value FROM tblVendor A
JOIN tblProduct B
ON B.vendor_id = A.vendor_id
WHERE B.subCategory_id in
(SELECT subCategory_id FROM tblSubCategory where subCategory_name = 'Physical documents')
AND (B.product_name LIKE '%drivers license%'
OR B.product_name LIKE '%driving licence%'
OR B.product_name LIKE '%drivers licence%'
OR B.product_name LIKE '%passport%'
OR B.product_name LIKE '%fake id%'
OR B.product_name LIKE '%resident id%'
OR B.product_name LIKE '%passport%')
GROUP BY B.vendor_id
''', index_col='vendor_id')
In [89]:
quantiles = [(x*10, x/10) for x in range(0,10)]
#quantiles = [(1,.25),(2,.75)]
quantiles_sum = pd.DataFrame()
quantiles_sum['total_gbp'] = pd.Series(
[drug_vendors_df[drug_vendors_df.GBP_value > drug_vendors_df.GBP_value.quantile(q[1])]['GBP_value'].sum() for q in quantiles],
[q[0] for q in quantiles]
)
sorted_vendors = vendors_df.sort_values('GBP_value')
sorted_drug_vendors = drug_vendors_df.sort_values('GBP_value')
sorted_docs_vendors = docs_vendors_df.sort_values('GBP_value')
sorted_coke_vendors = coke_vendors_df.sort_values('GBP_value')
In [132]:
largest_value = sorted_vendors.tail((len(sorted_vendors))*10//100)['GBP_value'].sum()
total_value = sorted_vendors['GBP_value'].sum()
print ('10% largest vendors market value {}'.format(largest_value))
print ('Market total value: {} GBP'.format(total_value))
print ('###############################')
largest_drug_value = sorted_drug_vendors.tail((len(sorted_drug_vendors))*10//100)['GBP_value'].sum()
total_drug_value = sorted_drug_vendors['GBP_value'].sum()
print ('10% largest drug vendors market value {}'.format(largest_drug_value))
print ('Drugs Market total value: {} GBP'.format(total_drug_value))
print ('##############################')
largest_docs_value = sorted_docs_vendors.tail((len(sorted_docs_vendors))*10//100)['GBP_value'].sum()
total_docs_value = sorted_docs_vendors['GBP_value'].sum()
print ('10% largest fake documents vendors market value {}'.format(largest_docs_value))
print ('Fake Documents Market total value: {} GBP'.format(total_docs_value))
print ('##############################')
largest_cocaine_value = sorted_coke_vendors.tail((len(sorted_coke_vendors))*10//100)['GBP_value'].sum()
total_cocaine_value = sorted_coke_vendors['GBP_value'].sum()
print ('10% largest cocaine vendors market value {}'.format(largest_cocaine_value))
print ('Cocaine Market total value: {} GBP'.format(total_cocaine_value))
import matplotlib.pyplot as plt
labels = 'Largest Vendors (90th percentile)', 'Rest of the market'
sizes = [largest_value, total_value - largest_value]
colors = ['gold', 'lightskyblue']
explode = (0.1, 0)
plt.axis('equal')
plt.title('Largest vendors total value compared with the rest of the market')
plt.pie(sizes, explode=explode, labels=labels, colors=colors,
autopct='%1.1f%%', shadow=True, startangle=90)
Out[132]:
The Hypothesis is now confirmed, the largest vendors concentrate the majority of the market value. This becomes even more prevalent within the drugs marke (which remeber, copes 85% of the total market) Having such concentration of the value within a small vendors is a factor that points us to the fact that the agents behind these markets are not small independant vendors but rather larger criminal organizations.
However, while this hypothesis is supported by the evidence, it doesn't fully prove to what extent this criminal organizations may be operating behind these sites.
In our attempt to unveil the size and behaviour of the agents operating in the site, it would be helpfull to undrestand the range of countris from which these vendors are provided. the hypothesis is that the vendors not only operate locally, but rather represented by larger criminal organizations that operate at an international scope.
In [13]:
vendors_geo_df = db.pandas_query('''
SELECT A.vendor_id, SUM(B.GBP) as GBP_value, COUNT(DISTINCT B.product_origin) as countries FROM tblVendor A
JOIN tblProduct B
ON B.vendor_id = A.vendor_id
GROUP BY B.vendor_id
ORDER BY GBP_value;
''', index_col='vendor_id')
vendors_geo_df['countries'].mean()
Out[13]:
In [14]:
vendors_geo_df[(vendors_geo_df['GBP_value'] > vendors_geo_df['GBP_value'].quantile(.9))]['countries'].mean()
Out[14]:
In [15]:
drug_vendors_geo_df = db.pandas_query('''
SELECT A.vendor_id, SUM(B.GBP) as GBP_value, COUNT(DISTINCT B.product_origin) as countries FROM tblVendor A
JOIN tblProduct B
ON B.vendor_id = A.vendor_id
WHERE subCategory_id in
(SELECT subCategory_id from tblSubCategory where subCategory_name IN
('2C', '5-MeO', 'Barbiturates', 'Benzos', 'Buprenorphine', 'Cannabis', 'Chemicals', 'Cocaine',
'Codeine', 'Concentrates', 'Containers', 'Dihydrocodeine', 'Dissociatives', 'DMT', 'Doomsday',
'Drugs', 'Ecstasy', 'Edibles', 'Fentanyl', 'Filters', 'Fireworks', 'GBL', 'GHB', 'Grinders',
'Hash', 'Heroin', 'Hydrocodone', 'Hydromorphone', 'Injecting equipment', 'Ketamine', 'LSD', 'MDA',
'MDMA', 'Mephedrone', 'Mescaline', 'Meth', 'Morphine', 'Mushrooms', 'MXE', 'NB', 'Needles', 'Opioids',
'Opium', 'Oxycodone', 'Paper', 'Paraphernalia', 'PCP', 'Pills', 'Pipes', 'Prescription', 'Psychedelics',
'RCs', 'Salvia', 'Scales', 'Science', 'Seeds', 'Shake/trim', 'Smoked', 'Speed', 'Spores', 'Stashes',
'Steroids', 'Stimulants', 'Synthetics', 'Syringes', 'Weed', 'Weight loss')
)
GROUP BY B.vendor_id;
''', index_col='vendor_id')
coke_vendors_geo_df = db.pandas_query('''
SELECT A.vendor_id, SUM(B.GBP) as GBP_value, COUNT(DISTINCT B.product_origin) as countries FROM tblVendor A
JOIN tblProduct B
ON B.vendor_id = A.vendor_id
WHERE subCategory_id in
(SELECT subCategory_id from tblSubCategory where subCategory_name IN
('Cocaine')
)
GROUP BY B.vendor_id;
''', index_col='vendor_id')
docs_vendors_geo_df = db.pandas_query('''
SELECT A.vendor_id, SUM(B.GBP) as GBP_value, COUNT(DISTINCT B.product_origin) as countries FROM tblVendor A
JOIN tblProduct B
ON B.vendor_id = A.vendor_id
WHERE B.subCategory_id in
(SELECT subCategory_id FROM tblSubCategory where subCategory_name = 'Physical documents')
AND (B.product_name LIKE '%drivers license%'
OR B.product_name LIKE '%driving licence%'
OR B.product_name LIKE '%drivers licence%'
OR B.product_name LIKE '%passport%'
OR B.product_name LIKE '%fake id%'
OR B.product_name LIKE '%resident id%'
OR B.product_name LIKE '%passport%')
GROUP BY B.vendor_id
''', index_col='vendor_id')
In [16]:
sorted_vendors = vendors_df.sort_values('GBP_value')
sorted_drug_geo_vendors = drug_vendors_geo_df.sort_values('GBP_value')
sorted_docs_geo_vendors = docs_vendors_geo_df.sort_values('GBP_value')
sorted_coke_geo_vendors = coke_vendors_geo_df.sort_values('GBP_value')
In [17]:
print ('10% largest drug vendors countries avg {}'.format(
sorted_drug_geo_vendors.tail((len(sorted_drug_geo_vendors))*10//100)['countries'].mean())
)
print ('10% largest fake documents vendors countries avg {}'.format(
sorted_docs_geo_vendors.tail((len(sorted_docs_geo_vendors))*10//100)['countries'].mean())
)
print ('10% largest cocaine vendors countries avg {}'.format(
sorted_coke_geo_vendors.tail((len(sorted_coke_geo_vendors))*10//100)['countries'].mean())
)
In [ ]:
In [28]:
print ('10 largest drug vendors countries avg {}'.format(
sorted_drug_geo_vendors.tail(10)['countries'].mean())
)
print ('10 largest fake documents vendors countries avg {}'.format(
sorted_docs_geo_vendors.tail(10)['countries'].mean())
)
print ('10 largest cocaine vendors countries avg {}'.format(
sorted_coke_geo_vendors.tail(10)['countries'].mean())
)
In [103]:
vendors_cat_df = db.pandas_query('''
SELECT A.vendor_id, SUM(B.GBP) as GBP_value, COUNT(DISTINCT B.subCategory_id) as categories FROM tblVendor A
JOIN tblProduct B
ON B.vendor_id = A.vendor_id
GROUP BY B.vendor_id
ORDER BY GBP_value;
''', index_col='vendor_id')
drug_vendors_cat_df = db.pandas_query('''
SELECT A.vendor_id, SUM(B.GBP) as GBP_value, COUNT(DISTINCT B.subCategory_id) as categories FROM tblVendor A
JOIN tblProduct B
ON B.vendor_id = A.vendor_id
WHERE subCategory_id in
(SELECT subCategory_id from tblSubCategory where subCategory_name IN
('2C', '5-MeO', 'Barbiturates', 'Benzos', 'Buprenorphine', 'Cannabis', 'Chemicals', 'Cocaine',
'Codeine', 'Concentrates', 'Containers', 'Dihydrocodeine', 'Dissociatives', 'DMT', 'Doomsday',
'Drugs', 'Ecstasy', 'Edibles', 'Fentanyl', 'Filters', 'Fireworks', 'GBL', 'GHB', 'Grinders',
'Hash', 'Heroin', 'Hydrocodone', 'Hydromorphone', 'Injecting equipment', 'Ketamine', 'LSD', 'MDA',
'MDMA', 'Mephedrone', 'Mescaline', 'Meth', 'Morphine', 'Mushrooms', 'MXE', 'NB', 'Needles', 'Opioids',
'Opium', 'Oxycodone', 'Paper', 'Paraphernalia', 'PCP', 'Pills', 'Pipes', 'Prescription', 'Psychedelics',
'RCs', 'Salvia', 'Scales', 'Science', 'Seeds', 'Shake/trim', 'Smoked', 'Speed', 'Spores', 'Stashes',
'Steroids', 'Stimulants', 'Synthetics', 'Syringes', 'Weed', 'Weight loss')
)
GROUP BY B.vendor_id;
''', index_col='vendor_id')
coke_vendors_cat_df = db.pandas_query('''
SELECT A.vendor_id, SUM(B.GBP) as GBP_value, COUNT(DISTINCT B.subCategory_id) as categories FROM tblVendor A
JOIN tblProduct B
ON B.vendor_id = A.vendor_id
WHERE subCategory_id in
(SELECT subCategory_id from tblSubCategory where subCategory_name IN
('Cocaine')
)
GROUP BY B.vendor_id;
''', index_col='vendor_id')
docs_vendors_cat_df = db.pandas_query('''
SELECT A.vendor_id, SUM(B.GBP) as GBP_value, COUNT(DISTINCT B.subCategory_id) as categories FROM tblVendor A
JOIN tblProduct B
ON B.vendor_id = A.vendor_id
WHERE B.subCategory_id in
(SELECT subCategory_id FROM tblSubCategory where subCategory_name = 'Physical documents')
AND (B.product_name LIKE '%drivers license%'
OR B.product_name LIKE '%driving licence%'
OR B.product_name LIKE '%drivers licence%'
OR B.product_name LIKE '%passport%'
OR B.product_name LIKE '%fake id%'
OR B.product_name LIKE '%resident id%'
OR B.product_name LIKE '%passport%')
GROUP BY B.vendor_id
''', index_col='vendor_id')
In [104]:
sorted_cat_vendors = vendors_cat_df.sort_values('GBP_value')
sorted_drug_cat_vendors = drug_vendors_cat_df.sort_values('GBP_value')
sorted_docs_cat_vendors = docs_vendors_cat_df.sort_values('GBP_value')
sorted_coke_cat_vendors = coke_vendors_cat_df.sort_values('GBP_value')
In [109]:
print ('10% largest vendors categories avg {}'.format(
sorted_cat_vendors.tail((len(sorted_cat_vendors))*10//100)['categories'].mean())
)
print ('10% largest drug vendors categories avg {}'.format(
sorted_drug_cat_vendors.tail((len(sorted_drug_cat_vendors))*10//100)['categories'].mean())
)
print ('10% largest fake documents vendors categories avg {}'.format(
sorted_docs_cat_vendors.tail((len(sorted_docs_cat_vendors))*10//100)['categories'].mean())
)
In [108]:
print ('10 largest drug vendors categories avg {}'.format(
sorted_drug_cat_vendors.tail(10)['categories'].mean())
)
print ('10 largest fake documents vendors categories avg {}'.format(
sorted_docs_cat_vendors.tail(10)['categories'].mean())
)
In [ ]: