In [1]:
# Sample URLs for the regex exercises below; the list deliberately mixes
# http, https and ftp schemes as well as hyphenated and numeric host names.
urls = [
    'http://www.domain.com',
    'https://somedomain.com',
    'http://my-domain-123.net',
    'https://google.com',
    'http://www.foo.com',
    'https://bar-baz3.com',
    'ftp://domain2.com',
]
In [3]:
import re
In [28]:
# Keep only the URLs whose scheme is https.
#
# re.match anchors the pattern at the start of the string, so a URL that merely
# contains 'https://' further along (e.g. inside a query string) is not selected.
#
# A stricter, whole-URL check requiring a host made of dot-separated labels
# would be r'https://[\w-]+(\.[\w-]+)+' (note the escaped dot and the hyphen
# allowed inside a label). The top-level domain (.com, .net, ...) is
# intentionally not validated: there are too many valid options.
[u for u in urls if re.match(r'https://', u)]
Out[28]:
In [31]:
# Extract the host name from every http:// or https:// URL; URLs with any
# other scheme (e.g. ftp://) are skipped.
#
# Pattern notes:
#   * raw string, so \w and \. reach the regex engine untouched (a plain
#     string literal with \w is an invalid escape sequence and warns on
#     recent Python versions);
#   * the dot between labels is escaped -- an unescaped '.' matches ANY
#     character, which would accept dotless hosts such as 'localhost';
#   * [\w-] lets a label contain hyphens (e.g. 'my-domain-123');
#   * at least two dot-separated labels are required.
# .match anchors the scheme at the start of the URL.
HOST_RE = re.compile(r'https?://([\w-]+(?:\.[\w-]+)+)')
[m.group(1) for m in map(HOST_RE.match, urls) if m]
Out[31]:
In [32]:
# Sample language / locale codes in inconsistent spellings: mixed case, with
# either '_' or '-' between the language and the optional region part.
languages = [
    'Ar', 'It', 'it',
    'En', 'En_gb', 'jp',
    'en_GB', 'EN_IE', 'en-NZ',
    'en', 'es', 'ES-es',
]
In [37]:
# Number of codes that start with "en", compared case-insensitively.
len([code for code in languages if re.search('^en', code, re.IGNORECASE)])
Out[37]:
In [67]:
# Normalise every code to 'll' or 'll-RR': lower-case two-letter language,
# optionally followed by '-' and an upper-case two-letter region.
#
# fullmatch requires the WHOLE string to have that shape, so malformed input
# such as 'eng' or 'en_GBR' is dropped instead of being silently truncated
# to 'en'. The separator is restricted to '-' or '_' (a bare '.' would accept
# any character, turning e.g. 'enXgb' into 'en-GB').
#   group(1) -> language part, group(2) -> region part (None when absent)
LOCALE_RE = re.compile(r'([A-Za-z]{2})(?:[-_]([A-Za-z]{2}))?')
[m.group(1).lower() + ('-' + m.group(2).upper() if m.group(2) else '')
 for m in map(LOCALE_RE.fullmatch, languages) if m]
Out[67]: