In [ ]:
import pandas as pd

In [ ]:
# Read the tab-separated CFP export and start with every proposal marked
# as not selected; later cells flip the flag for accepted proposals.
df = pd.read_table("cfp.tsv").assign(selected=False)

In [ ]:
# Accepted 2015 proposals, grouped by proposal kind. Each value is a
# newline-separated block of lower-case title fragments (one per line);
# indentation and blank lines are stripped by the cell that consumes it.
sel_2015 = dict(
    talks="""
        simple hacks to make your django website faster
        pretty printing in python
        machine learning techniques for building a large scale
        laying out your django projects
        python and riak
        building flexible filesystems with fuse-python
        symengine: the future fast core of computer algebra systems
        test driven development with ansible
        explore big data using simple python code
        introduction to nipype and how do we create
        python load balancer: 0 to 1 million requests per second
        creating, deployment & customizing
        building nextgen iot solutions
        consuming government data with python and d3
        python traceback for humans
        how to build microservices using zeromq and wsgi
        rip nagios. hello docker shinken
        building offensive web security framework in python
        how to detect phishing urls using pyspark decision trees
        fedmsg: the message bus of fedora infrastructure
        concurrent data processing in python
        analyzing arguments during a debate using natural language processing
        avoiding common pitfalls of datetime from a webapp
        python 3 metaprogramming
        rest apis - what, why and how
        solving logical puzzles with natural language processing
""",
    workshops="""
        getting started with ansible
        let's learn statistics
        using devstack to contribute to openstack
        building nextgen iot solutions using python and cloud
        reasoning under uncertainty with python
        python on your mobile phone(advanced concepts)
        django projects the right way
        symbolic computation with python, sympy
        thinking in functions
        """,
)

In [ ]:
# Accepted 2016 proposals, grouped by proposal kind. Each value is a
# newline-separated block of lower-case title fragments (one per line);
# indentation and blank lines are stripped by the cell that consumes it.
sel_2016 = dict(
    talks="""
        hacking the python ast
        helix and salt: case study in high volume and distributed python applications
        realtime microservices with server side flux
        building an automatic keyphrase extraction system using nltk
        testing native binaries using cffi and py.test
        the trends in choosing licenses in python ecosystem
        good bye, call stack; hello, event driven architectures
        algorithmic music generation
        python byte code hacks
        load testing using locust.io
        continuous integration for data scientists
        building companion chatbot with python
        deploying your python backend with
        big data analysis using pyspark
        flying a drone
        containerize upstream projects effortlessly
        financial modelling and simulation with python
        micropython - porting python to microcontrollers
        creating a recommendation engine based on nlp and contextual
        open source health monitoring and evaluation systems
        concurrency in modern robots
        building a secure iot platform using paho and flask
        don't write tests, generate them
        real time sentiment analysis with apache storm and python
    """,
    workshops="""
        building a lie detector: multi-modal sentiment analysis
        docker workshop
        optimizing neural networks with theano
        productive coding with pycharm
        demystifying the django rest framework
        scaling django with kubernetes
    """,
)

In [ ]:
# 2016
# Flag each accepted 2016 proposal in `df` by looking up its title fragment.
# A fragment that does not identify exactly one row is printed so it can be
# fixed by hand instead of being silently mis-tagged.
for prop_type, title in sel_2016.items():
    # One fragment per line; drop indentation and blank lines.
    titles = [t.strip() for t in title.splitlines() if t.strip()]
    for tl in titles:
        # regex=False: fragments are literal text ("py.test", "locust.io"),
        #   not patterns, so "." must not act as a wildcard.
        # na=False: a row with a missing title is a non-match rather than a
        #   NaN in the mask (which boolean indexing rejects).
        xdf = df[df.title.str.contains(tl, case=False, regex=False, na=False)]
        if xdf.shape[0] != 1:
            print(tl)
        else:
            df.loc[xdf.index[0], "selected"] = True

In [ ]:
# 2015
# Flag each accepted 2015 proposal in `df` by looking up its title fragment.
# When a fragment matches several rows, the candidates are narrowed to the
# proposal kind currently being processed; whatever still does not resolve
# to exactly one row is printed for manual follow-up.
for prop_type, title in sel_2015.items():
    # One fragment per line; drop indentation and blank lines.
    titles = [t.strip() for t in title.splitlines() if t.strip()]
    for tl in titles:
        # regex=False: fragments are literal text; e.g. the parentheses in
        #   "python on your mobile phone(advanced concepts)" would otherwise
        #   be parsed as a regex group and never match the real title.
        # na=False: a row with a missing title is a non-match rather than a
        #   NaN in the mask (which boolean indexing rejects).
        xdf = df[df.title.str.contains(tl, case=False, regex=False, na=False)]
        if xdf.shape[0] > 1:
            # Same title submitted under more than one kind: keep only rows
            # of the kind being processed.
            # NOTE(review): assumes the `type` column holds values such as
            # "Workshops" that equal the dict keys ignoring case - confirm
            # for talks. If they differ, the title is reported below rather
            # than mis-tagged.
            xdf = xdf[xdf['type'].str.lower() == prop_type]
        if xdf.shape[0] == 1:
            df.loc[xdf.index[0], "selected"] = True
        else:
            # No match, or still ambiguous after narrowing by kind.
            print(tl)

In [ ]:
# Tally of selected vs. not-selected proposals (displayed as the cell output).
df.loc[:, 'selected'].value_counts()

In [ ]:
# Fraction of all proposals that ended up flagged as selected.
selected_count = df['selected'].sum()
total_count = df.shape[0]
print(selected_count / total_count)

In [ ]:
# Write the tagged proposals back out as a tab-separated file, omitting the
# DataFrame index column.
df.to_csv("tagged.tsv", index=False, sep="\t")