import requests
import re
from urllib.parse import urljoin
def download(url, user_agent='wswp', num_retries=5, proxies=None):
    """Download a URL and return its HTML, retrying transient 5xx errors."""
    print('Downloading:', url)
    headers = {'User-Agent': user_agent}
    try:
        resp = requests.get(url, headers=headers, proxies=proxies)
        html = resp.text
        if resp.status_code >= 400:
            # Client or server error: discard the body.
            print('Download error:', resp.text)
            html = None
            if num_retries and 500 <= resp.status_code < 600:
                # Recursively retry server errors, preserving the original settings.
                return download(url, user_agent=user_agent,
                                num_retries=num_retries - 1, proxies=proxies)
    except requests.exceptions.RequestException as e:
        # Connection-level failure (DNS error, timeout, refused connection, ...).
        print('Download error:', e)
        html = None
    return html
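# Usage sketch for download(), kept commented out so importing or running this
# module behaves as before. The URL is the demo site already used later in
# this script; substitute your own target.
# html = download('http://example.python-scraping.com')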
def craw_sitemap(url):
    """Download a sitemap and crawl every URL listed in its <loc> tags."""
    sitemap = download(url)
    if sitemap is None:
        return
    print(sitemap)
    # Extract the link from each <loc> element.
    links = re.findall(r'<loc>(.*?)</loc>', sitemap)
    print(links)
    for link in links:
        html = download(link)
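# Usage sketch for craw_sitemap(), commented out. The /sitemap.xml path is an
# assumption about where the target site publishes its sitemap; adjust as needed.
# craw_sitemap('http://example.python-scraping.com/sitemap.xml')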
import itertools
def craw_site(url, max_error=5):
    """Crawl numbered pages (url + '1', url + '2', ...) until max_error downloads fail."""
    num_errors = 0
    for page in itertools.count(1):
        pg_url = '{}{}'.format(url, page)
        html = download(pg_url)
        if html is None:
            num_errors += 1
            if num_errors == max_error:
                # Give up once enough pages have failed to download.
                break
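# Usage sketch for craw_site(), commented out. It assumes the site numbers its
# pages sequentially after the base URL (e.g. .../view/1, .../view/2, ...);
# the path below is an illustrative guess based on the regex used in __main__.
# craw_site('http://example.python-scraping.com/places/default/view/')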
def get_links(html):
    """Return the href value of every anchor tag in the page."""
    webpage_regex = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)
from urllib import robotparser


def get_robots_parser(robots_url):
    """Return a RobotFileParser loaded from the given robots.txt URL."""
    rp = robotparser.RobotFileParser()
    rp.set_url(robots_url)
    rp.read()
    return rp


def link_craw(start_url, link_regex, robots_url=None, user_agent='wswp'):
    """Crawl from start_url, following links that match link_regex and
    respecting the site's robots.txt."""
    if not robots_url:
        robots_url = '{}/robots.txt'.format(start_url)
    rp = get_robots_parser(robots_url)
    craw_queue = [start_url]
    seen = set(craw_queue)
    while craw_queue:
        url = craw_queue.pop()
        if rp.can_fetch(user_agent, url):
            html = download(url, user_agent=user_agent)
        else:
            print('Blocked by robots.txt:', url)
            html = None
        if html is None:
            continue
        for link in get_links(html):
            if re.match(link_regex, link):
                # Resolve relative links and skip URLs we have already queued.
                abs_link = urljoin(start_url, link)
                if abs_link not in seen:
                    seen.add(abs_link)
                    craw_queue.append(abs_link)
if __name__ == '__main__':
    # The 'BadCrawler' user agent is meant to exercise the robots.txt check,
    # which typically blocks it on the example site.
    link_craw('http://example.python-scraping.com',
              '/places/default/(index|view)', user_agent='BadCrawler')