文章归档友情连接照片地图

Python爬虫学习笔记

分类:Python编程  作者:rming  时间:2014-10-13
我想说 `urllib2` 确实很难用。后来有人向我推荐了 requests 这个库,还是直接去用 requests 吧。(2014.10.19)

连通性测试

#!/usr/bin/env python
#coding=utf-8
import urllib2
def check(url='http://m.baidu.com'):
    """Probe *url* once and report whether it could be opened.

    Opens the URL with urllib2. If urllib2.URLError is raised, prints the
    error's ``code`` and/or ``reason`` attribute (whichever the exception
    object actually carries) and reports failure.

    Args:
        url: Address to open; defaults to 'http://m.baidu.com'.

    Returns:
        True if the URL was opened without a URLError, False otherwise.
    """
    request = urllib2.Request(url)
    try:
        response = urllib2.urlopen(request)
    except urllib2.URLError as e:
        # The exception may expose either attribute (or both), so test each
        # one separately instead of assuming a particular subclass.
        if hasattr(e, 'code'):
            print('error code: %s' % (e.code,))
        if hasattr(e, 'reason'):
            print('error reason: %s' % (e.reason,))
        return False
    # Only reachability matters here; release the connection right away.
    response.close()
    return True

测试代码

#!/usr/bin/env python
#coding:utf-8
import cookielib
import sys
import urllib
import urllib2

import prepare
# Connectivity probe before anything else.
# NOTE(review): `prepare` is presumably the check() listing above saved as
# prepare.py -- confirm.
prepare.check()

# --- urllib2: request with form data and custom headers ---
url = 'http://m.baidu.com'
queries = {
    'name': 'rming',
    'passwd': 'password',
}
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
# Supplying a data payload makes urllib2 issue a POST rather than a GET.
data = urllib.urlencode(queries)
headers = {
    'User-Agent': user_agent,
}
request = urllib2.Request(url, data, headers)
# Headers can also be attached after the Request object exists.
request.add_header('Accept', 'text/html')
request.add_header('Referer', 'http://mail.qq.com')
print(request.headers)

response = urllib2.urlopen(request, timeout=10)
try:
    html = response.read()
    print("info: %s" % (response.info(),))
    print("geturl: %s" % (response.geturl(),))
finally:
    # Always release the connection, even if reading fails.
    response.close()
print("length: %s" % (len(html),))
pos = html.find('href')
# find() returns -1 when there is no match; slicing from -1 would print an
# unrelated fragment, so only show the excerpt when 'href' was found.
if pos != -1:
    print(html[pos:pos + 100])

# --- debug: handlers created with debuglevel=1 ---
http_handler = urllib2.HTTPHandler(debuglevel=1)
https_handler = urllib2.HTTPSHandler(debuglevel=1)
opener = urllib2.build_opener(http_handler, https_handler)
# From here on urllib2.urlopen() goes through this opener.
urllib2.install_opener(opener)
response = urllib2.urlopen('http://m.qq.com')
response.close()

# --- cookielib: collect the cookies set by a response ---
cookie = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
response = opener.open('http://www.baidu.com')
response.close()
for item in cookie:
    print('%s => %s' % (item.name, item.value))

# --- default encoding ---
# sys.setdefaultencoding is removed from the sys namespace at interpreter
# start-up; reload(sys) brings it back so it can be called.
# NOTE(review): this changes implicit str<->unicode conversion for the whole
# process; prefer explicit encode()/decode() in real code.
reload(sys)
sys.setdefaultencoding("utf8")
print(sys.getdefaultencoding())

# --- raw_input ---
# raw_input returns a byte string; decode it once at the input boundary so it
# can be combined safely with the unicode labels below.
input_encoding = sys.stdin.encoding or sys.getdefaultencoding()
name = raw_input(u'请输入你的名字:\n').decode(input_encoding)
# int() raises ValueError if the answer is not a whole number.
age = int(raw_input(u'请输入你的年龄:\n'))
print(u"名字: %s" % (name,))
print(u"年龄: %s" % (age,))

QA:

1.编码错误,报错:Non-ASCII character '\xe8' in file

        原因:文件开头缺少编码声明: #coding:utf-8

2.编码错误,报错:'ascii' codec can't encode characters in position

        如果已经设置了上面的编码声明,可以试试去掉中文字符串(u"字符串")前面的 "u"。
        如果还不行,可以尝试使用 string 的 decode() 和 encode()。
        也可以在文件开头(使用中文之前)通过 sys 模块 reload,然后设置默认编码:

import sys

# sys.setdefaultencoding is removed from the sys namespace at interpreter
# start-up; reload(sys) brings it back so it can be called.
reload(sys)
sys.setdefaultencoding("utf8")


提交评论