
Python web scraping study notes, part 1

Day 1 of web scraping

Thoughts on the content

There are two issues in the book:

  1. Incorrect regex matching

    link_craw('http://example.python-scraping.com', '/places/default/(index|view)', user_agent='BadCrawler')

    The pattern here should have the form /places/default/(index|view), because re.match only matches from the start of the string, so the repeated leading part of the path has to be written out as well (see the short sketch after this list).

  2. In the section explaining the requests version, the code is missing a return html at the end.
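
For the first point, a quick interactive check makes the re.match behaviour concrete. This is only a minimal sketch; the link value is an example path of the kind the site uses, not captured output:

    import re

    link = '/places/default/view/Afghanistan-1'
    # re.match anchors at the start of the string, so the leading path must be in the pattern
    print(re.match('/(index|view)', link))                 # None
    print(re.match('/places/default/(index|view)', link))  # match
    # re.search, by contrast, finds the pattern anywhere in the string
    print(re.search('/(index|view)', link))                # match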

Code

#!/usr/bin/env python3
# coding=utf-8
import requests
import re
from urllib.parse import urljoin

def download(url, user_agent='wswp', robots=None, num_retries=5, proxies=None):
    # Download a page with requests, retrying 5xx server errors up to num_retries times.
    print("Downloading:", url)
    headers = {'User-Agent': user_agent}
    try:
        resp = requests.get(url, headers=headers, proxies=proxies)
        html = resp.text
        if resp.status_code >= 400:
            print("Download error:", resp.text)
            html = None
        if num_retries and 500 <= resp.status_code < 600:
            # retry only server errors, keeping the original settings
            return download(url, user_agent=user_agent,
                            num_retries=num_retries - 1, proxies=proxies)
    except requests.exceptions.RequestException as e:
        # RequestException has no reliable .reason attribute, so print the exception itself
        print("Download error:", e)
        html = None
    return html

def craw_sitemap(url):
    # Download the sitemap, then download every page listed in its <loc> tags.
    sitemap = download(url)
    print(sitemap)
    links = re.findall(r'<loc>(.*?)</loc>', sitemap)
    print(links)
    for link in links:
        html = download(link)


import itertools



def craw_site(url, max_error=5):
    # Crawl pages by appending an increasing database ID to the URL,
    # stopping after max_error failed downloads.
    num_errors = 0
    for page in itertools.count(1):
        pg_url = "{}{}".format(url, page)
        html = download(pg_url)
        if html is None:
            num_errors += 1
            if num_errors == max_error:
                break

def get_links(html):
    # Extract href values from the page; this simple pattern assumes the
    # double-quoted href is the last attribute before the closing ">".
    webpage_regex = re.compile(r'a href="(.*?)">')
    return webpage_regex.findall(html)

from urllib import robotparser

def get_robots_parser(robots_url):
    # Fetch and parse the site's robots.txt.
    rp = robotparser.RobotFileParser()
    rp.set_url(robots_url)
    rp.read()
    return rp

def link_craw(start_url, link_regex, robots_url=None, user_agent='wswp'):
    # Crawl outward from start_url, following only links that match link_regex
    # and respecting robots.txt.
    if not robots_url:
        robots_url = '{}/robots.txt'.format(start_url)
    rp = get_robots_parser(robots_url)
    craw_queue = [start_url]
    seen = set(craw_queue)  # remember visited links to avoid crawling duplicates
    while craw_queue:
        url = craw_queue.pop()
        if rp.can_fetch(user_agent, url):
            html = download(url, user_agent=user_agent)
        else:
            print("Blocked by robots.txt:", url)
            html = None
        if html is None:
            continue
        for link in get_links(html):
            if re.match(link_regex, link):
                abs_link = urljoin(start_url, link)
                if abs_link not in seen:
                    seen.add(abs_link)
                    craw_queue.append(abs_link)


if __name__ == '__main__':
    # craw_sitemap("http://example.python-scraping.com/sitemap.xml")
    # craw_site("http://example.python-scraping.com/view/-")
    link_craw('http://example.python-scraping.com', '/places/default/(index|view)', user_agent='BadCrawler')

Summary

  1. When crawling links, watch out for duplicate links.
  2. Pay attention to robots.txt (see the sketch after this list).
  3. A site can be crawled by iterating over database IDs.
  4. Analyze the page yourself and make good use of regular expressions.
  5. The requests library is relatively simple to use.
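
As a minimal standalone check of point 2, the sketch below asks robotparser directly whether two user agents may fetch a page. It assumes, as on the book's example site, that robots.txt disallows the BadCrawler user agent; the expected results are noted in comments rather than captured output:

    from urllib import robotparser

    rp = robotparser.RobotFileParser()
    rp.set_url('http://example.python-scraping.com/robots.txt')
    rp.read()  # fetch and parse robots.txt

    url = 'http://example.python-scraping.com/places/default/index'
    print(rp.can_fetch('wswp', url))        # expected: True
    print(rp.can_fetch('BadCrawler', url))  # expected: False, assuming BadCrawler is disallowed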

Author: NoOne
Post URL: https://noonegroup.xyz/posts/3e78770/
Copyright notice: please credit the source when reposting!