Python Web Crawler

  1. Using urllib2

  When fetching a page with urllib2 you can set a timeout so the program does not hang mid-run; you also need to handle exceptions so that the program does not terminate when some pages cannot be reached.

#coding:utf-8
import urllib2

try:
    url = "http://www.baidu.com"
    f = urllib2.urlopen(url, timeout=0)  # timeout sets the time limit; 0 here forces an immediate timeout for demonstration
    result = f.read()
    print result
except Exception, e:
    print 'a', str(e)

Result:

a <urlopen error timed out>
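
The timeout=0 above makes the request time out at once, which is why the exception fires. In normal use you would allow a few seconds; a minimal sketch (the 10-second value is just an illustrative choice):

#coding:utf-8
import urllib2

try:
    url = "http://www.baidu.com"
    f = urllib2.urlopen(url, timeout=10)  # give the server up to 10 seconds to respond
    result = f.read()
    print result[:100]  # print only the first 100 bytes
except Exception, e:
    print 'a', str(e)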

Exception handling
  If you want to handle URLError and HTTPError in your code, there are two ways to do it; the code follows:

#! /usr/bin/env python
#coding=utf-8
import urllib2

url = "xxxxxx"  # the URL to fetch
try:
    response = urllib2.urlopen(url)
except urllib2.HTTPError, e:  # HTTPError must be caught before URLError
    print "The server couldn't fulfill the request"
    print "Error code:", e.code
    print "Return content:", e.read()
except urllib2.URLError, e:
    print "Failed to reach the server"
    print "The reason:", e.reason
else:
    # runs only when no exception was raised; process the response here
    pass

#! /usr/bin/env python
#coding=utf-8
import urllib2

url = "http://xxx"  # the URL to fetch
try:
    response = urllib2.urlopen(url)
except urllib2.URLError, e:
    if hasattr(e, "reason"):
        print "Failed to reach the server"
        print "The reason:", e.reason
    elif hasattr(e, "code"):
        print "The server couldn't fulfill the request"
        print "Error code:", e.code
        print "Return content:", e.read()
    else:
        pass  # handle any other case

  By comparison, the second approach to exception handling is the better one.

  2. Using the re module

During the matching process, the re patterns were hard to get right, mostly through lack of familiarity; the following were the main ones used:

p = re.compile(r'''<a href="http://wiki.jikexueyuan.com/project/start-learning-python/(.+?).html" >.+?</a>''')
matchs = p.findall(contents)

  This extracts all of the page names. Because the pattern contains a single capture group, findall returns a list of the captured strings (with more than one group it would return a list of tuples).
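
A quick interactive check of that return shape (the sample anchor tag below is made up):

>>> import re
>>> sample = '<a href="http://wiki.jikexueyuan.com/project/start-learning-python/0.html" >Intro</a>'
>>> re.findall(r'<a href="http://wiki.jikexueyuan.com/project/start-learning-python/(.+?).html" >.+?</a>', sample)
['0']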

linkpatten = re.compile(r'''href="/.+?/css''')
contents = re.sub(linkpatten, r'href="./css', contents)

  This rewrites every href="/.+?/css match in contents into the href="./css form; all occurrences are replaced, not just the first.
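
A small before/after sketch on a made-up line of HTML:

>>> import re
>>> snippet = '<link rel="stylesheet" href="/assets/css/style.css">'
>>> re.sub(r'href="/.+?/css', r'href="./css', snippet)
'<link rel="stylesheet" href="./css/style.css">'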

linkpatten = re.compile(r'''href="/assets/(.+?)\.ico"''')
contents = re.sub(linkpatten, r'href="./images/\g<1>.ico"', contents)

  This finds every href="/assets/(.+?).ico" string in contents, captures the name of the .ico file, and rewrites it to the href="./images/\g<1>.ico" form, where the \g<1> part is filled in with the captured name; again, every occurrence is replaced.
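
A quick sketch of how the group reference behaves, together with the equivalent named-group form (the favicon path is invented):

>>> import re
>>> snippet = '<link rel="shortcut icon" href="/assets/favicon.ico" type="image/x-icon">'
>>> re.sub(r'href="/assets/(.+?)\.ico"', r'href="./images/\g<1>.ico"', snippet)
'<link rel="shortcut icon" href="./images/favicon.ico" type="image/x-icon">'
>>> re.sub(r'href="/assets/(?P<name>.+?)\.ico"', r'href="./images/\g<name>.ico"', snippet)
'<link rel="shortcut icon" href="./images/favicon.ico" type="image/x-icon">'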

Summary
  In a regex, () marks the content to be extracted. In re.sub(linkpatten, r'href="./images/\g<1>.ico"', contents), a backslash plus g plus a group number or name in angle brackets, i.e. \g<...>, refers to that (named) group. For saving files, the recommended form is:

with open(filename, 'w') as cssfile:
    cssfile.write(req.read())

  Note also that when saving image files (png or jpg) they must be written in binary mode ("w+b"); however, some sites serve compressed image data, and the saved file will not open as an image. Such files can be saved in the following way:

with open(filename,"wb") as code:
decompresser = zlib.decompressobj(16+zlib.MAX_WBITS)
data = decompresser.decompress(f.read())
code.write(data)

  I don't currently know how to tell whether an image has been compressed, or in which format; if you do, I'd be glad to hear about it.
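
One possible approach (just a sketch, not verified against that site; url and filename are placeholders) is to look at the Content-Encoding response header, or to sniff the gzip magic bytes at the start of the body:

import urllib2
import zlib

f = urllib2.urlopen(url)
data = f.read()

# header the server sets when it compressed the body, e.g. "gzip"
encoding = f.info().getheader('Content-Encoding')

# gzip streams always start with the magic bytes 0x1f 0x8b
if encoding == 'gzip' or data[:2] == '\x1f\x8b':
    data = zlib.decompressobj(16 + zlib.MAX_WBITS).decompress(data)

with open(filename, 'wb') as imgfile:
    imgfile.write(data)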

  3. Greedy vs. non-greedy
      When you repeat a regular expression, as with a*, it matches as much of the pattern as possible. This often bites you when you try to match a pair of balanced delimiters, such as the angle brackets of an HTML tag: a pattern for matching a single HTML tag does not work, because .* is greedy by nature.
>>> s = '<html><head><title>Title</title>'
>>> len(s)
32
>>> print re.match('<.*>', s).span()
(0, 32)
>>> print re.match('<.*>', s).group()
<html><head><title>Title</title>

  The RE matches the '<' in '<html>', and .* consumes the rest of the string. There is still more of the RE left over, and since '>' cannot match at the end of the string, the engine has to backtrack character by character until it finds a match for '>'. The final match therefore runs from the '<' in '<html>' to the '>' in '</title>', which is not what you want.

  In this situation, the solution is to use the non-greedy qualifiers *?, +?, ??, or {m,n}?, which match as little text as possible. In the example above, '>' is tried immediately after the first '<' matches; when it fails, the engine advances one character at a time and retries '>' at every step. This gives the right result:

>>> print re.match('<.*?>', s).group()
<html>
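
Applying the same non-greedy pattern with findall pulls out every tag, which is usually what a scraper wants:

>>> re.findall('<.*?>', s)
['<html>', '<head>', '<title>', '</title>']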

### Finally, here is the raw first version of my scraper for the Jikexueyuan Python tutorial content

# -*- coding:utf-8 -*-
import re
import urllib2
import logging
import time

import zlib

logging.basicConfig(level=logging.INFO)
"""
"""
main_dir = r'D:\mugwort\book\python\base_python'
main_url = r'http://wiki.jikexueyuan.com/project/start-learning-python/'


def sava_css(contents):
    # download every stylesheet referenced by the page and save it under main_dir\css
    p = re.compile(r'''<link rel="stylesheet" type="text/css" href="(.+?)">''')
    matchs = p.findall(contents)
    for match in matchs:
        logging.info(match)
        if not match.startswith("//"):
            match = "http://wiki.jikexueyuan.com" + match
        else:
            match = "http:" + match
        logging.info("format:" + match)
        req = urllib2.urlopen(match)
        contents = req.read()
        names = re.compile(r'''.+?/(.*?)\.css''').findall(match)
        name = names.pop(len(names) - 1)
        logging.info(name)
        logging.info("rindex %d length %d " % (name.rindex("/") + 1, len(name)))
        logging.info("name format:" + name[name.rindex("/") + 1:len(name)])
        filename = main_dir + '\\css\\' + name[name.rindex("/") + 1:len(name)] + ".css"
        if filename:
            logging.info(filename)
            with open(filename, 'w') as cssfile:
                cssfile.write(contents)


def sava_js(contents):
    # download every script referenced by the page and save it under main_dir\js
    p = re.compile(r'''<script type="text/javascript" src="(.+?)">''')
    matchs = p.findall(contents)
    for match in matchs:
        logging.info(match)
        if not match.startswith("//"):
            match = "http://wiki.jikexueyuan.com" + match
        else:
            match = "http:" + match
        logging.info("format:" + match)
        req = urllib2.urlopen(match)
        contents = req.read()
        names = re.compile(r'''.+?/(.*?)\.js''').findall(match)
        logging.info(names)
        name = names.pop(len(names) - 1)
        logging.info(name)
        logging.info("rindex %d length %d " % (name.rindex("/") + 1, len(name)))
        logging.info("name format:" + name[name.rindex("/") + 1:len(name)])
        filename = main_dir + '\\js\\' + name[name.rindex("/") + 1:len(name)] + ".js"
        if filename:
            logging.info(filename)
            with open(filename, 'w') as cssfile:
                cssfile.write(contents)


def sava_img(contents):
    # download every image (and the favicon) referenced by the page and save it under main_dir\images
    p = re.compile(r'''<img src="(.+?)".+?>|<link.+?href="(.+?)".+?type="image/x-icon".+?>''')
    matchs = p.findall(contents)
    for matchT in matchs:
        logging.info(matchT)
        match = matchT[0]
        if not match:
            match = matchT[1]
        logging.info(match)
        if not match.startswith("//"):
            match = "http://wiki.jikexueyuan.com" + match
        else:
            match = "http:" + match
        logging.info("format:" + match)
        logging.info("rindex %d length %d " % (match.rindex(r"."), len(match)))
        subfix = match[match.rindex("."):len(match)]
        logging.info("subfix:" + subfix)
        try:
            req = urllib2.urlopen(match)
            names = re.compile(r'''.+?/(.*?)\.png|.+?/(.*?)\.jpg|.+?/(.*?)\.ico''').findall(match)
            logging.info(names)
            nameT = names.pop(0)
            name = nameT[0]
            if not name:
                name = nameT[1]
            if not name:
                name = nameT[2]
            logging.info(name)
            logging.info("rindex %d length %d " % (name.rindex("/") + 1, len(name)))
            logging.info("name format:" + name[name.rindex("/") + 1:len(name)])
            filename = main_dir + '\\images\\' + name[name.rindex("/") + 1:len(name)] + subfix
            if filename:
                logging.info(filename)
                # .png files are written in binary mode
                if ".png" == subfix:
                    with open(filename, 'w+b') as cssfile:
                        cssfile.write(req.read())
                # .jpg files are also written in binary mode
                elif ".jpg" == subfix:
                    with open(filename, 'w+b') as cssfile:
                        cssfile.write(req.read())
                else:
                    with open(filename, 'w') as cssfile:
                        cssfile.write(req.read())
        except urllib2.URLError, e:
            if hasattr(e, "reason"):
                print "The reason:", e.reason
            elif hasattr(e, "code"):
                print "Error code:", e.code
                print "Return content:", e.read()
            else:
                pass  # other exception handling


def save_html(urlname):
    # fetch and save the html page itself
    url = main_url + urlname + '.html'
    file_name = main_dir + '\\' + urlname + '.html'
    req = urllib2.urlopen(url)
    contents = req.read()
    sava_css(contents)
    sava_js(contents)
    sava_img(contents)
    # rewrite the addresses so they point to the local copies
    # normalize css references
    contents = re.sub(main_url, r"", contents)
    linkpatten = re.compile(r'''href="/.+?/css''')
    contents = re.sub(linkpatten, r'href="./css', contents)
    # normalize js references
    linkpatten = re.compile(r'''src=".*?/js/(.+?)\.js.*?"''')
    contents = re.sub(linkpatten, r'src="./js/\g<1>.js"', contents)
    linkpatten = re.compile(r"src='.*?/js/(.+?)\.js.*?'")
    contents = re.sub(linkpatten, r'src="./js/\g<1>.js"', contents)
    linkpatten = re.compile(r'''src=".*?/src/(.+?)\.js.*?"''')
    contents = re.sub(linkpatten, r'src="./js/\g<1>.js"', contents)
    # normalize image references
    linkpatten = re.compile(r'''src=".*?/images/(.+?)\.png"''')
    contents = re.sub(linkpatten, r'src="./images/\g<1>.png"', contents)
    linkpatten = re.compile(r'''src=".*?/images/(.+?)\.jpg"''')
    contents = re.sub(linkpatten, r'src="./images/\g<1>.jpg"', contents)
    linkpatten = re.compile(r'''href="/assets/(.+?)\.ico"''')
    contents = re.sub(linkpatten, r'href="./images/\g<1>.ico"', contents)

    with open(file_name, 'w') as urlfile:
        urlfile.write(contents)


# entry point: grab the tutorial index, then fetch every chapter page it links to
req = urllib2.urlopen(r'http://wiki.jikexueyuan.com/project/start-learning-python/')
p = re.compile(r'''<a href="http://wiki.jikexueyuan.com/project/start-learning-python/(.+?).html" >.+?</a>''')
contents = req.read().decode("utf-8")
matchs = p.findall(contents)
logging.info(len(matchs))
logging.info(time.time())
for row in matchs:
    save_html(row)
logging.info(time.time())

References: