In [32]:
import pandas as pd
In [33]:
df = pd.read_csv('data_tau.csv')
In [34]:
df.head()
Out[34]:
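To get the date of each title (as the number of days since it was posted), we will need the following algorithm:
1. If the `date` text contains the word "hours", the title was posted within the last day, so we count it as 1 day.
2. Otherwise, pick the digits that precede the word "day" and use them as the number of days.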
To apply this algorithm, we need to be able to pick these words and digits out of a string. For that we will use regular expressions.
In [35]:
import re
In [36]:
test_string = "Hello world, welcome to 2016."
In [37]:
# We can pass the whole string as the pattern.
# re.search scans the string and returns a match object for the first occurrence of the pattern (or None if there is no match).
a = re.search('Hello world, welcome to 2016',test_string)
In [38]:
a
Out[38]:
In [39]:
a.group()
Out[39]:
In [40]:
# '.' matches any single character (except a newline), so the first match is the first character of the string
a = re.search('.',test_string)
a.group()
Out[40]:
In [41]:
# '.*' greedily matches zero or more characters up to a newline; test_string has no newline, so this matches the whole string
a = re.search('.*',test_string)
a.group()
Out[41]:
In [42]:
a = re.search('Hello',test_string)
print(a)
Some basic symbols
?
The question mark indicates zero or one occurrences of the preceding element. For example, colou?r matches both "color" and "colour".
\*
The asterisk indicates zero or more occurrences of the preceding element. For example, ab*c matches "ac", "abc", "abbc", "abbbc", and so on.
\+
The plus sign indicates one or more occurrences of the preceding element. For example, ab+c matches "abc", "abbc", "abbbc", and so on, but not "ac".
In [43]:
# \w matches one word character and . matches whatever character follows it.
# The pattern is a raw string so that the backslash reaches the regex engine
# instead of being treated as an (invalid) string escape sequence.
a = re.search(r'\w.',test_string)
print(a)
In [44]:
# \w* matches a run of zero or more word characters starting at the beginning of the string.
# The pattern is a raw string so that the backslash reaches the regex engine
# instead of being treated as an (invalid) string escape sequence.
a = re.search(r'\w*',test_string)
print(a)
In [45]:
string = '''In 2016, we are learning Text Analytics in Data Science 101
by scraping http://datatau.com'''
In [46]:
string = "In 2016, we are learning Text Analytics in Data Science 101 by scraping http://datatau.com"
Write a regex to pick the number 2016 from the string above.
In [ ]:
Write a regex to pick the URL (e.g. http://xyz.com) from the string above.
In [ ]:
In [47]:
df.head()
Out[47]:
In [48]:
df.tail()
Out[48]:
In [49]:
date_string = df['date'][0]
In [50]:
print(date_string)
In [51]:
re.search('hours',date_string)
Out[51]:
In [52]:
date_string = df['date'][50]
In [53]:
print(date_string)
In [54]:
# If hours is not there, we don't get any match
re.search('hours',date_string)
In [55]:
# Let us match the digits preceding the day text.
# The pattern is a raw string so that \d reaches the regex engine
# instead of being treated as an (invalid) string escape sequence.
day_search = re.search(r'\d+ day',date_string)
day_search
Out[55]:
In [56]:
days_string = day_search.group(0)
days_string
Out[56]:
In [57]:
days = days_string.split(' ')[0]
days
Out[57]:
In [58]:
def return_reg_ex_days(row):
    """Return how many days ago a row was posted, parsed from its 'date' text.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide a 'date' entry; it is expected to be text that either
        mentions hours or contains '<N> day'.

    Returns
    -------
    int or None
        1 when the text mentions an hour count (anything posted within the
        last day is counted as one day), the integer preceding ' day'
        otherwise, and None when the value is not text or has neither form.
    """
    date_text = row['date']

    # Missing values (e.g. NaN) are not strings; re.search would raise
    # TypeError on them, so report "unknown" instead.
    if not isinstance(date_text, str):
        return None

    # 'hour' also covers the singular form ('1 hour'), which the pattern
    # 'hours' would miss.
    if re.search('hour', date_text) is not None:
        return 1

    day_search = re.search(r'(\d+) day', date_text)
    if day_search is None:
        # Neither an hour nor a day count is present: there is no number to
        # extract, so avoid calling .group() on None.
        return None

    # Convert to int so every row yields the same numeric type rather than
    # mixing the int 1 with digit strings.
    return int(day_search.group(1))
In [59]:
# Now we apply this function to each row of the dataframe (axis=1 passes one row at a time)
# and store the results in a new 'days' column
df['days'] = df.apply(return_reg_ex_days,axis=1)
In [60]:
df.head()
Out[60]:
In [61]:
df.tail()
Out[61]:
In [62]:
# Let us save the dataframe to a CSV file (index=False leaves the row index out of the file)
df.to_csv('data_tau_days.csv', index=False)