In [1]:
import urllib2
from scrapy.selector import Selector
from scrapy.http import HtmlResponse

In [2]:
xpath = '//table/tbody/tr[2]/td[2]/textarea'
url_src = 'http://e-maxx.ru/algo/src_euler_function'
url = 'http://e-maxx.ru/algo/euler_function'

In [3]:
r = urllib2.urlopen(url_src)
body = r.read()
print 'type',type(body)
#body = unicode(body, 'utf=8', errors='replace')
ext = Selector(text=body).xpath('//textarea/text()').extract()


type <type 'str'>

In [4]:
#print ext[0]
ext[0].encode('utf8')
print ext[0].decode('utf8', 'ignore')


---------------------------------------------------------------------------
UnicodeEncodeError                        Traceback (most recent call last)
<ipython-input-4-7ca47da96189> in <module>()
      1 #print ext[0]
      2 ext[0].encode('utf8')
----> 3 print ext[0].decode('utf8', 'ignore')

/usr/lib/python2.7/encodings/utf_8.pyc in decode(input, errors)
     14 
     15 def decode(input, errors='strict'):
---> 16     return codecs.utf_8_decode(input, errors, True)
     17 
     18 class IncrementalEncoder(codecs.IncrementalEncoder):

UnicodeEncodeError: 'ascii' codec can't encode characters in position 5-11: ordinal not in range(128)

In [5]:
print ext[0].encode('utf8')
file = open('euler-ru.txt','w')
file.write(ext[0].encode('utf8'))


---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-5-6b0a2bd14123> in <module>()
----> 1 print ext[0].encode('utf8')
      2 file = open('euler-ru.txt','w')
      3 file.write(ext[0].encode('utf8'))
      4 

IndexError: list index out of range

In [ ]:
r = urllib2.urlopen(url)
r.headers['content-type']

In [36]:
_, params = cgi.parse_header(r.headers.get('Content-Type', ''))
encoding = params.get('charset', 'unicode')
text = r.read().decode(encoding)
print url


---------------------------------------------------------------------------
LookupError                               Traceback (most recent call last)
<ipython-input-36-9e41dd3f32ad> in <module>()
      1 _, params = cgi.parse_header(r.headers.get('Content-Type', ''))
      2 encoding = params.get('charset', 'unicode')
----> 3 text = r.read().decode(encoding)
      4 print url

LookupError: unknown encoding: unicode

In [ ]: