In [1]:
import requests
In [2]:
from io import BytesIO
In [3]:
import pandas as pd
In [4]:
spreadsheet_url = "https://docs.google.com/spreadsheets/d/1WbYov7KrliIvh9Ei485zxPF27Wx7-CYFZliNj3hZ9WE"
In [5]:
stream = requests.get("{0}/export?format=xlsx".format(spreadsheet_url))
In [6]:
full_table = pd.read_excel(BytesIO(stream.content), sheetname="Project-DB")
In [7]:
full_table.head()
Out[7]:
In [8]:
curated = full_table[:38]
In [9]:
curated.columns
Out[9]:
The following columns are ignored for later processing:
In [10]:
matching = {
"url": "web_url",
"title": "title",
"description": "description_content",
"logo": "logo_url",
"code_repository": "repository_url",
"organization_name": "organization_name",
"code_license": "code_license",
"releasedate": "publishedAt"
}
In [11]:
projects_columns = ["id", "user_id", "title", "description_tech", "description_content",
"code_license", "data_license", "logo_url", "web_url", "repository_url",
"organization_name", "dev_support_needed", "general_support_needed",
"publishedAt", "createddAt", "updatedAt"]
In [12]:
projects_export = curated[list(matching.keys())].copy()
projects_export.columns = [matching[key] for key in projects_export.columns]
projects_export["id"] = projects_export.index + 1
In [13]:
for key in projects_columns:
if key not in projects_export.columns:
projects_export[key] = None
In [14]:
projects_export.head()
Out[14]:
In [15]:
projects_export.to_csv("projects.csv", index=False, header=False, columns=projects_columns)
In [ ]:
In [16]:
s = pd.DataFrame(curated["categories"].str.split(',').tolist()).stack()
print(len(s))
s.head()
Out[16]:
In [17]:
s.index = s.index.droplevel(-1)
In [18]:
s.name = 'category'
In [19]:
s.head()
Out[19]:
In [27]:
category_join = projects_export.join(s)
In [28]:
category_join.head()
Out[28]:
Load categories. Create dict to ID. Columns:
In [22]:
category_nodes_translations_columns = ["id", "category_node_id", "title", "locale"]
In [23]:
categories_german = pd.read_csv("category_nodes_translations.csv", header=None, names=category_nodes_translations_columns)
In [24]:
categories_german.head()
Out[24]:
In [25]:
title2id = dict(categories_german[["title", "category_node_id"]].itertuples(index=False))
Try to find categories from project table in the existing list. Will probably introduce a lot of missing values. A real string search (using different languages) would yield better results.
In [29]:
category_join["category_node_id"] = [title2id.get(category) for category in category_join["category"]]
In [32]:
project_categories_export = category_join[["id", "category_node_id", "category"]].copy()
In [34]:
project_categories_export.rename(columns={"id": "project_id"}, inplace=True)
In [36]:
project_categories_export.dropna(subset=["category_node_id"])
Out[36]:
Empty table.