Python Web Crawler

  1. Using urllib2

  When fetching a page with urllib2 you can set a timeout so the program does not hang mid-run; you also need to handle exceptions so that the program does not terminate when some pages cannot be reached.

#coding:utf-8
import urllib2

try:
    url = "http://www.baidu.com"
    f = urllib2.urlopen(url, timeout=0)  # timeout sets the time limit; 0 here forces an immediate timeout for demonstration
    result = f.read()
    print result
except Exception, e:
    print 'a', str(e)

Result:

a <urlopen error timed out>
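
The timeout=0 above makes the request time out at once, which is why the exception fires. In normal use you would allow a few seconds; a minimal sketch (the 10-second value is just an illustrative choice):

#coding:utf-8
import urllib2

try:
    url = "http://www.baidu.com"
    f = urllib2.urlopen(url, timeout=10)  # give the server up to 10 seconds to respond
    result = f.read()
    print result[:100]  # print only the first 100 bytes
except Exception, e:
    print 'a', str(e)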

Exception handling
  If you want to handle URLError and HTTPError in your code, there are two ways to do it; the code follows:

#! /usr/bin/env python
#coding=utf-8
import urllib2

url = "xxxxxx"  # the URL to fetch
try:
    response = urllib2.urlopen(url)
except urllib2.HTTPError, e:  # HTTPError must be caught before URLError
    print "The server couldn't fulfill the request"
    print "Error code:", e.code
    print "Return content:", e.read()
except urllib2.URLError, e:
    print "Failed to reach the server"
    print "The reason:", e.reason
else:
    # runs only when no exception was raised; process the response here
    pass

#! /usr/bin/env python
#coding=utf-8
import urllib2

url = "http://xxx"  # the URL to fetch
try:
    response = urllib2.urlopen(url)
except urllib2.URLError, e:
    if hasattr(e, "reason"):
        print "Failed to reach the server"
        print "The reason:", e.reason
    elif hasattr(e, "code"):
        print "The server couldn't fulfill the request"
        print "Error code:", e.code
        print "Return content:", e.read()
    else:
        pass  # handle any other case

  By comparison, the second approach to exception handling is the better one.

  2. Using the re module

During the matching process, the re patterns were hard to get right, mostly through lack of familiarity; the following were the main ones used:

p = re.compile(r'''<a href="http://wiki.jikexueyuan.com/project/start-learning-python/(.+?).html" >.+?</a>''')
matchs = p.findall(contents)

  This extracts all of the page names. Because the pattern contains a single capture group, findall returns a list of the captured strings (with more than one group it would return a list of tuples).
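
A quick interactive check of that return shape (the sample anchor tag below is made up):

>>> import re
>>> sample = '<a href="http://wiki.jikexueyuan.com/project/start-learning-python/0.html" >Intro</a>'
>>> re.findall(r'<a href="http://wiki.jikexueyuan.com/project/start-learning-python/(.+?).html" >.+?</a>', sample)
['0']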

linkpatten = re.compile(r'''href="/.+?/css''')
contents = re.sub(linkpatten, r'href="./css', contents)

  This rewrites every href="/.+?/css match in contents into the href="./css form; all occurrences are replaced, not just the first.
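
A small before/after sketch on a made-up line of HTML:

>>> import re
>>> snippet = '<link rel="stylesheet" href="/assets/css/style.css">'
>>> re.sub(r'href="/.+?/css', r'href="./css', snippet)
'<link rel="stylesheet" href="./css/style.css">'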

linkpatten = re.compile(r'''href="/assets/(.+?)\.ico"''')
contents = re.sub(linkpatten, r'href="./images/\g<1>.ico"', contents)

  This finds every href="/assets/(.+?).ico" string in contents, captures the name of the .ico file, and rewrites it to the href="./images/\g<1>.ico" form, where the \g<1> part is filled in with the captured name; again, every occurrence is replaced.
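
A quick sketch of how the group reference behaves, together with the equivalent named-group form (the favicon path is invented):

>>> import re
>>> snippet = '<link rel="shortcut icon" href="/assets/favicon.ico" type="image/x-icon">'
>>> re.sub(r'href="/assets/(.+?)\.ico"', r'href="./images/\g<1>.ico"', snippet)
'<link rel="shortcut icon" href="./images/favicon.ico" type="image/x-icon">'
>>> re.sub(r'href="/assets/(?P<name>.+?)\.ico"', r'href="./images/\g<name>.ico"', snippet)
'<link rel="shortcut icon" href="./images/favicon.ico" type="image/x-icon">'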

Summary
  In a regex, () marks the content to be extracted. In re.sub(linkpatten, r'href="./images/\g<1>.ico"', contents), a backslash plus g plus a group number or name in angle brackets, i.e. \g<...>, refers to that (named) group. For saving files, the recommended form is:

with open(filename, 'w') as cssfile:
    cssfile.write(req.read())

  Note also that when saving image files (png or jpg) they must be written in binary mode ("w+b"); however, some sites serve compressed image data, and the saved file will not open as an image. Such files can be saved in the following way:

with open(filename,"wb") as code:
decompresser = zlib.decompressobj(16+zlib.MAX_WBITS)
data = decompresser.decompress(f.read())
code.write(data)

  I don't currently know how to tell whether an image has been compressed, or in which format; if you do, I'd be glad to hear about it.
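
One possible approach (just a sketch, not verified against that site; url and filename are placeholders) is to look at the Content-Encoding response header, or to sniff the gzip magic bytes at the start of the body:

import urllib2
import zlib

f = urllib2.urlopen(url)
data = f.read()

# header the server sets when it compressed the body, e.g. "gzip"
encoding = f.info().getheader('Content-Encoding')

# gzip streams always start with the magic bytes 0x1f 0x8b
if encoding == 'gzip' or data[:2] == '\x1f\x8b':
    data = zlib.decompressobj(16 + zlib.MAX_WBITS).decompress(data)

with open(filename, 'wb') as imgfile:
    imgfile.write(data)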

  3. Greedy vs. non-greedy
      When you repeat a regular expression, as with a*, it matches as much of the pattern as possible. This often bites you when you try to match a pair of balanced delimiters, such as the angle brackets of an HTML tag: a pattern for matching a single HTML tag does not work, because .* is greedy by nature.
>>> s = '<html><head><title>Title</title>'
>>> len(s)
32
>>> print re.match('<.*>', s).span()
(0, 32)
>>> print re.match('<.*>', s).group()
<html><head><title>Title</title>

  The RE matches the '<' in '<html>', and .* consumes the rest of the string. There is still more of the RE left over, and since '>' cannot match at the end of the string, the engine has to backtrack character by character until it finds a match for '>'. The final match therefore runs from the '<' in '<html>' to the '>' in '</title>', which is not what you want.

  In this situation, the solution is to use the non-greedy qualifiers *?, +?, ??, or {m,n}?, which match as little text as possible. In the example above, '>' is tried immediately after the first '<' matches; when it fails, the engine advances one character at a time and retries '>' at every step. This gives the right result:

>>> print re.match('<.*?>', s).group()
<html>
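
Applying the same non-greedy pattern with findall pulls out every tag, which is usually what a scraper wants:

>>> re.findall('<.*?>', s)
['<html>', '<head>', '<title>', '</title>']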

### Finally, here is the raw first version of my scraper for the Jikexueyuan Python tutorial content

# -*- coding:utf-8 -*-
import re
import urllib2
import logging
import time

import zlib

logging.basicConfig(level=logging.INFO)
"""
"""
main_dir = r'D:\mugwort\book\python\base_python'
main_url = r'http://wiki.jikexueyuan.com/project/start-learning-python/'


def sava_css(contents):
    # download every stylesheet referenced by the page and save it under main_dir\css
    p = re.compile(r'''<link rel="stylesheet" type="text/css" href="(.+?)">''')
    matchs = p.findall(contents)
    for match in matchs:
        logging.info(match)
        if not match.startswith("//"):
            match = "http://wiki.jikexueyuan.com" + match
        else:
            match = "http:" + match
        logging.info("format:" + match)
        req = urllib2.urlopen(match)
        contents = req.read()
        names = re.compile(r'''.+?/(.*?)\.css''').findall(match)
        name = names.pop(len(names) - 1)
        logging.info(name)
        logging.info("rindex %d length %d " % (name.rindex("/") + 1, len(name)))
        logging.info("name format:" + name[name.rindex("/") + 1:len(name)])
        filename = main_dir + '\\css\\' + name[name.rindex("/") + 1:len(name)] + ".css"
        if filename:
            logging.info(filename)
            with open(filename, 'w') as cssfile:
                cssfile.write(contents)


def sava_js(contents):
    # download every script referenced by the page and save it under main_dir\js
    p = re.compile(r'''<script type="text/javascript" src="(.+?)">''')
    matchs = p.findall(contents)
    for match in matchs:
        logging.info(match)
        if not match.startswith("//"):
            match = "http://wiki.jikexueyuan.com" + match
        else:
            match = "http:" + match
        logging.info("format:" + match)
        req = urllib2.urlopen(match)
        contents = req.read()
        names = re.compile(r'''.+?/(.*?)\.js''').findall(match)
        logging.info(names)
        name = names.pop(len(names) - 1)
        logging.info(name)
        logging.info("rindex %d length %d " % (name.rindex("/") + 1, len(name)))
        logging.info("name format:" + name[name.rindex("/") + 1:len(name)])
        filename = main_dir + '\\js\\' + name[name.rindex("/") + 1:len(name)] + ".js"
        if filename:
            logging.info(filename)
            with open(filename, 'w') as cssfile:
                cssfile.write(contents)


def sava_img(contents):
    # download every image (and the favicon) referenced by the page and save it under main_dir\images
    p = re.compile(r'''<img src="(.+?)".+?>|<link.+?href="(.+?)".+?type="image/x-icon".+?>''')
    matchs = p.findall(contents)
    for matchT in matchs:
        logging.info(matchT)
        match = matchT[0]
        if not match:
            match = matchT[1]
        logging.info(match)
        if not match.startswith("//"):
            match = "http://wiki.jikexueyuan.com" + match
        else:
            match = "http:" + match
        logging.info("format:" + match)
        logging.info("rindex %d length %d " % (match.rindex(r"."), len(match)))
        subfix = match[match.rindex("."):len(match)]
        logging.info("subfix:" + subfix)
        try:
            req = urllib2.urlopen(match)
            names = re.compile(r'''.+?/(.*?)\.png|.+?/(.*?)\.jpg|.+?/(.*?)\.ico''').findall(match)
            logging.info(names)
            nameT = names.pop(0)
            name = nameT[0]
            if not name:
                name = nameT[1]
            if not name:
                name = nameT[2]
            logging.info(name)
            logging.info("rindex %d length %d " % (name.rindex("/") + 1, len(name)))
            logging.info("name format:" + name[name.rindex("/") + 1:len(name)])
            filename = main_dir + '\\images\\' + name[name.rindex("/") + 1:len(name)] + subfix
            if filename:
                logging.info(filename)
                # .png files are written in binary mode
                if ".png" == subfix:
                    with open(filename, 'w+b') as cssfile:
                        cssfile.write(req.read())
                # .jpg files are also written in binary mode
                elif ".jpg" == subfix:
                    with open(filename, 'w+b') as cssfile:
                        cssfile.write(req.read())
                else:
                    with open(filename, 'w') as cssfile:
                        cssfile.write(req.read())
        except urllib2.URLError, e:
            if hasattr(e, "reason"):
                print "The reason:", e.reason
            elif hasattr(e, "code"):
                print "Error code:", e.code
                print "Return content:", e.read()
            else:
                pass  # other exception handling


def save_html(urlname):
    # fetch and save the html page itself
    url = main_url + urlname + '.html'
    file_name = main_dir + '\\' + urlname + '.html'
    req = urllib2.urlopen(url)
    contents = req.read()
    sava_css(contents)
    sava_js(contents)
    sava_img(contents)
    # rewrite the addresses so they point to the local copies
    # normalize css references
    contents = re.sub(main_url, r"", contents)
    linkpatten = re.compile(r'''href="/.+?/css''')
    contents = re.sub(linkpatten, r'href="./css', contents)
    # normalize js references
    linkpatten = re.compile(r'''src=".*?/js/(.+?)\.js.*?"''')
    contents = re.sub(linkpatten, r'src="./js/\g<1>.js"', contents)
    linkpatten = re.compile(r"src='.*?/js/(.+?)\.js.*?'")
    contents = re.sub(linkpatten, r'src="./js/\g<1>.js"', contents)
    linkpatten = re.compile(r'''src=".*?/src/(.+?)\.js.*?"''')
    contents = re.sub(linkpatten, r'src="./js/\g<1>.js"', contents)
    # normalize image references
    linkpatten = re.compile(r'''src=".*?/images/(.+?)\.png"''')
    contents = re.sub(linkpatten, r'src="./images/\g<1>.png"', contents)
    linkpatten = re.compile(r'''src=".*?/images/(.+?)\.jpg"''')
    contents = re.sub(linkpatten, r'src="./images/\g<1>.jpg"', contents)
    linkpatten = re.compile(r'''href="/assets/(.+?)\.ico"''')
    contents = re.sub(linkpatten, r'href="./images/\g<1>.ico"', contents)

    with open(file_name, 'w') as urlfile:
        urlfile.write(contents)


# entry point: grab the tutorial index, then fetch every chapter page it links to
req = urllib2.urlopen(r'http://wiki.jikexueyuan.com/project/start-learning-python/')
p = re.compile(r'''<a href="http://wiki.jikexueyuan.com/project/start-learning-python/(.+?).html" >.+?</a>''')
contents = req.read().decode("utf-8")
matchs = p.findall(contents)
logging.info(len(matchs))
logging.info(time.time())
for row in matchs:
    save_html(row)
logging.info(time.time())

References: