In [1]:
import pandas as pd
from jeeves.preprocessing import rem_unicode, remove_digits, remove_punctuation, remove_stopwords

In [2]:
df = pd.read_table("cfp.tsv")

In [3]:
df.head()


Out[3]:
title n_votes n_comments year speaker_links content_urls speaker_info target_audience section type last_updated prerequisites description
0 Consuming Government Data with Python and D3 58 0 2015 Talks:You can reach me at GitHub and Twitter NaN Pratap Vardhan is a Data Scientist at Gramener... Intermediate Data Visualization and Analytics Talks 03 Sep, 2015 NaN The explosion of open data, especially governm...
1 Don’t get scared, Get Started! 19 1 2015 Tapasweni PathakShaifali Agrawal TBA Tapasweni PathakI have done bachelors in IT fr... Beginner Others Talks 31 May, 2015 Nothing.....other than your passion for coding. Opensource world is full of excitement, knowle...
2 Distributed scheduling leveraging multiple nod... 4 0 2015 http://glusterhacker.blogspot.in/ http://www.gluster.org/community/documentation... I am a software engineer at Red Hat Inc., work... Beginner Concurrency Talks 27 Apr, 2015 A basic understanding of how a distributed sys... Setting up a cron job in a machine, is perhaps...
3 Analyzing Python code with Pylint 22 0 2015 Pylint contributionsCPython contributions The project pageBitbucket forgeDocumentationPr... I'm an open source enthusiast coming from Roma... Intermediate Others Talks 21 Sep, 2015 The participants should have a basic understan... Given the dynamic nature of Python, some bugs ...
4 Python 3 Metaprogramming, Macros, Madness & More! 70 0 2015 NaN Github repository for the code that accompanie... Suhas is a Data Scientist at Gramener, and was... Intermediate Core Python Talks 03 Sep, 2015 NaN SummaryEver wanted to conquer the world, but j...

In [4]:
text_cols = "title speaker_info section target_audience type prerequisites description".split()
pipe = [rem_unicode, remove_digits, remove_punctuation, remove_stopwords, lambda x: x.lower()]
for col in text_cols:
    s = df.pop(col)
    for cleaner in pipe:
        s = s.astype(str).apply(cleaner)
    df[col] = s

In [5]:
df.head()


Out[5]:
n_votes n_comments year speaker_links content_urls last_updated title speaker_info section target_audience type prerequisites description
0 58 0 2015 Talks:You can reach me at GitHub and Twitter NaN 03 Sep, 2015 consuming government data python d pratap vardhan data scientist gramenercom data... data visualization analytics intermediate talks nan the explosion open data especially government ...
1 19 1 2015 Tapasweni PathakShaifali Agrawal TBA 31 May, 2015 dont get scared get started tapasweni pathaki done bachelors it igdtuw i w... others beginner talks nothingother passion coding opensource world full excitement knowledge enc...
2 4 0 2015 http://glusterhacker.blogspot.in/ http://www.gluster.org/community/documentation... 27 Apr, 2015 distributed scheduling leveraging multiple nod... i software engineer red hat inc working gluste... concurrency beginner talks a basic understanding distributed system works... setting cron job machine perhaps easiest way s...
3 22 0 2015 Pylint contributionsCPython contributions The project pageBitbucket forgeDocumentationPr... 21 Sep, 2015 analyzing python code pylint im open source enthusiast coming romania lead ... others intermediate talks the participants basic understanding python no... given dynamic nature python bugs tend creep co...
4 70 0 2015 NaN Github repository for the code that accompanie... 03 Sep, 2015 python metaprogramming macros madness more suhas data scientist gramener previously engin... core python intermediate talks nan summaryever wanted conquer world fell short kn...

In [6]:
df['speaker_link_present'] = False
df['content_url_present'] = False

In [7]:
import numpy as np

In [8]:
for col in df:
    if df[col].dtype is np.dtype('O'):
        df[col].fillna(value="", inplace=True)

In [9]:
URL_PATTERN = "(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?"
df.loc[df.speaker_links.str.contains(URL_PATTERN, case=False), "speaker_link_present"] = True
df.loc[df.content_urls.str.contains(URL_PATTERN, case=False), "content_url_present"] = True


/Users/jaidevd/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:2: UserWarning: This pattern has match groups. To actually get the groups, use str.extract.
  from ipykernel import kernelapp as app
/Users/jaidevd/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:3: UserWarning: This pattern has match groups. To actually get the groups, use str.extract.
  app.launch_new_instance()

In [10]:
df.content_url_present.sum() / float(df.shape[0])


Out[10]:
0.47058823529411764

In [11]:
del df['content_urls']
del df['speaker_links']

In [12]:
df['last_updated'] = pd.to_datetime(df.last_updated)

In [13]:
df.head()


Out[13]:
n_votes n_comments year last_updated title speaker_info section target_audience type prerequisites description speaker_link_present content_url_present
0 58 0 2015 2015-09-03 consuming government data python d pratap vardhan data scientist gramenercom data... data visualization analytics intermediate talks nan the explosion open data especially government ... False False
1 19 1 2015 2015-05-31 dont get scared get started tapasweni pathaki done bachelors it igdtuw i w... others beginner talks nothingother passion coding opensource world full excitement knowledge enc... False False
2 4 0 2015 2015-04-27 distributed scheduling leveraging multiple nod... i software engineer red hat inc working gluste... concurrency beginner talks a basic understanding distributed system works... setting cron job machine perhaps easiest way s... True True
3 22 0 2015 2015-09-21 analyzing python code pylint im open source enthusiast coming romania lead ... others intermediate talks the participants basic understanding python no... given dynamic nature python bugs tend creep co... False False
4 70 0 2015 2015-09-03 python metaprogramming macros madness more suhas data scientist gramener previously engin... core python intermediate talks nan summaryever wanted conquer world fell short kn... False False

In [14]:
deadline_16 = pd.to_datetime("1 July 2016")
deadline_15 = pd.to_datetime("1 June 2015")

In [15]:
df['deadlinediff'] = 0
df.loc[df.year == 2015, "deadlinediff"] = deadline_15 - df[df.year == 2015]['last_updated']
df.loc[df.year == 2016, "deadlinediff"] = deadline_16 - df[df.year == 2016]['last_updated']

In [16]:
del df['last_updated']

In [17]:
df.head()


Out[17]:
n_votes n_comments year title speaker_info section target_audience type prerequisites description speaker_link_present content_url_present deadlinediff
0 58 0 2015 consuming government data python d pratap vardhan data scientist gramenercom data... data visualization analytics intermediate talks nan the explosion open data especially government ... False False -94 days
1 19 1 2015 dont get scared get started tapasweni pathaki done bachelors it igdtuw i w... others beginner talks nothingother passion coding opensource world full excitement knowledge enc... False False 1 days
2 4 0 2015 distributed scheduling leveraging multiple nod... i software engineer red hat inc working gluste... concurrency beginner talks a basic understanding distributed system works... setting cron job machine perhaps easiest way s... True True 35 days
3 22 0 2015 analyzing python code pylint im open source enthusiast coming romania lead ... others intermediate talks the participants basic understanding python no... given dynamic nature python bugs tend creep co... False False -112 days
4 70 0 2015 python metaprogramming macros madness more suhas data scientist gramener previously engin... core python intermediate talks nan summaryever wanted conquer world fell short kn... False False -94 days

In [18]:
df.deadlinediff.min()


Out[18]:
Timedelta('-370 days +00:00:00')

In [19]:
df['deadlinediff'] = df.deadlinediff.apply(lambda x: x.days)

In [20]:
df.head()


Out[20]:
n_votes n_comments year title speaker_info section target_audience type prerequisites description speaker_link_present content_url_present deadlinediff
0 58 0 2015 consuming government data python d pratap vardhan data scientist gramenercom data... data visualization analytics intermediate talks nan the explosion open data especially government ... False False -94
1 19 1 2015 dont get scared get started tapasweni pathaki done bachelors it igdtuw i w... others beginner talks nothingother passion coding opensource world full excitement knowledge enc... False False 1
2 4 0 2015 distributed scheduling leveraging multiple nod... i software engineer red hat inc working gluste... concurrency beginner talks a basic understanding distributed system works... setting cron job machine perhaps easiest way s... True True 35
3 22 0 2015 analyzing python code pylint im open source enthusiast coming romania lead ... others intermediate talks the participants basic understanding python no... given dynamic nature python bugs tend creep co... False False -112
4 70 0 2015 python metaprogramming macros madness more suhas data scientist gramener previously engin... core python intermediate talks nan summaryever wanted conquer world fell short kn... False False -94

In [21]:
sel_2015 = """
        simple hacks to make your django website faster
        pretty printing in python
        machine learning techniques for building a large scale
        laying out your django projects
        python and riakdb
        building flexible filesystems with fuse-python
        symengine: the future fast core of computer algebra systems
        test driven development with ansible
        explore big data using simple python code
        introduction to nipype and how do we create
        python load balancer: 0 to 1 million requests per second
        creating, deployment & customizing
        building nextgen iot solutions
        consuming government data with python and d3
        python traceback for humans
        how to build microservices using zeromq and wsgi
        rip nagios. hello docker shinken
        building offensive web security framework in python
        how to detect phishing urls using pyspark decision trees
        fedmsg: the message bus of fedora infrastructure
        concurrent data processing in python
        analyzing arguments during a debate using natural language processing
        avoiding common pitfalls of datetime from a webapp
        python 2 metaprogramming, macros, madness & more
        rest apis - what, why and how
        solving logical puzzles with natural language processing
        getting started with ansible
        let's learn statistics
        using devstack to contribute to openstack
        building nextgen iot solutions using python and cloud
        reasoning under uncertainty with python
        python on your mobile phone(advanced concepts)
        django projects the right way
        symbolic computation with python, sympy
        thinking in functions
        """

sel_2016 = """
        hacking the python ast
        helix and salt: case study in high volume and distributed python applications
        realtime microservices with server side flux
        building an automatic keyphrase extraction system using nltk
        testing native binaries using cffi and py.test
        the trends in choosing licenses in python ecosystems
        good bye, call stack; hello, event driven architectures
        algorithmic music generation
        python byte code hacks
        load testing using locust.io
        continuous integration for data scientists
        building companion chatbot with python
        deploying your python backend with
        big data analysis using pyspark
        flying a drone
        containerize upstream projects effortlessly
        financial modelling and simulation with python
        micropython - porting python to microcontrollers
        creating a recommendation engine based on nlp and contextual
        open source health monitoring and evaluation systems
        concurrency in modern robots
        building a secure iot platform using paho and flask
        don't write tests, generate them
        real time sentiment analysis with apache storm and python
        building a lie detector: multi-modal sentiment analysis
        docker workshop
        talking to machines: optimizing neural networks with theano
        productive coding with pycharm
        demystifying the django rest framework
        scaling django with kubernetes
        """

sel_2015 = [l.rstrip().lstrip() for l in sel_2015.splitlines() if l]
sel_2016 = [l.rstrip().lstrip() for l in sel_2016.splitlines() if l]

In [22]:
df['selected'] = False

In [23]:
sel_2015 = [l for l in sel_2015 if l]
sel_2016 = [l for l in sel_2016 if l]

In [24]:
for proposal in sel_2015:
    proposal = remove_stopwords(proposal)
    proposal = remove_punctuation(proposal)
    sdf = df[df.year == 2015][df.title.str.contains(proposal, case=False)]
    if sdf.shape[0] != 1:
        print "2015", proposal
    else:
        df.loc[sdf.index[0], "selected"] = True
for proposal in sel_2016:
    proposal = remove_stopwords(proposal)
    proposal = remove_punctuation(proposal)
    sdf = df[df.year == 2016][df.title.str.contains(proposal, case=False)]
    if sdf.shape[0] != 1:
        print "2016", proposal
    else:
        df.loc[sdf.index[0], "selected"] = True


2015 simple hacks make django website faster
2015 python riakdb
2015 symengine future fast core computer algebra systems
2015 python load balancer 0 1 million requests per second
2015 consuming government data python d3
2015 python traceback humans
2015 fedmsg message bus fedora infrastructure
2015
/Users/jaidevd/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:4: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
/Users/jaidevd/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:12: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
 python 2 metaprogramming macros madness 
2016 trends choosing licenses python ecosystems
2016 flying drone
2016 talking machines optimizing neural networks theano

In [52]:
df[df.year == 2016][df.title.str.contains("theano", case=False)]


/Users/jaidevd/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:1: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  if __name__ == '__main__':
Out[52]:
n_votes n_comments year title speaker_info section target_audience type prerequisites description speaker_link_present content_url_present deadlinediff selected
174 19 1 2016 theano keras teaching python learn english i free software enthusiast researcher computer... scientific computing intermediate talks understanding machine learning algorithms favo... the main intention talk introduce people thean... False False 20 False
214 6 0 2016 talking machines optimizing neural networks t... deep learning robotics enthusiast presently fi... scientific computing intermediate workshops while topics introduced talk scratch familiari... with recent advances field deep learning compu... False False -27 False

In [53]:
df.loc[214, "selected"] = True

In [54]:
df.to_csv("tagged.tsv", index=False, sep="\t")

In [ ]: