import requests
import re
from urllib.parse import urljoin
def download(url, user_agent='wswp', num_retries=5, proxies=None):
    """Download a URL and return its HTML, retrying transient 5xx errors."""
    print('Downloading:', url)
    headers = {'User-Agent': user_agent}
    try:
        resp = requests.get(url, headers=headers, proxies=proxies)
        html = resp.text
        if resp.status_code >= 400:
            # Client or server error: discard the body.
            print('Download error:', resp.text)
            html = None
            if num_retries and 500 <= resp.status_code < 600:
                # Recursively retry server errors, preserving the original settings.
                return download(url, user_agent=user_agent,
                                num_retries=num_retries - 1, proxies=proxies)
    except requests.exceptions.RequestException as e:
        # Connection-level failure (DNS error, timeout, refused connection, ...).
        print('Download error:', e)
        html = None
    return html
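# Usage sketch for download(), kept commented out so importing or running this
# module behaves as before. The URL is the demo site already used later in
# this script; substitute your own target.
# html = download('http://example.python-scraping.com')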
def craw_sitemap(url):
    """Download a sitemap and crawl every URL listed in its <loc> tags."""
    sitemap = download(url)
    if sitemap is None:
        return
    print(sitemap)
    # Extract the link from each <loc> element.
    links = re.findall(r'<loc>(.*?)</loc>', sitemap)
    print(links)
    for link in links:
        html = download(link)
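# Usage sketch for craw_sitemap(), commented out. The /sitemap.xml path is an
# assumption about where the target site publishes its sitemap; adjust as needed.
# craw_sitemap('http://example.python-scraping.com/sitemap.xml')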
import itertools
def craw_site(url, max_error=5):
    """Crawl numbered pages (url + '1', url + '2', ...) until max_error downloads fail."""
    num_errors = 0
    for page in itertools.count(1):
        pg_url = '{}{}'.format(url, page)
        html = download(pg_url)
        if html is None:
            num_errors += 1
            if num_errors == max_error:
                # Give up once enough pages have failed to download.
                break
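# Usage sketch for craw_site(), commented out. It assumes the site numbers its
# pages sequentially after the base URL (e.g. .../view/1, .../view/2, ...);
# the path below is an illustrative guess based on the regex used in __main__.
# craw_site('http://example.python-scraping.com/places/default/view/')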
def get_links(html):
    """Return the href value of every anchor tag in the page."""
    webpage_regex = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)
from urllib import robotparser


def get_robots_parser(robots_url):
    """Return a RobotFileParser loaded from the given robots.txt URL."""
    rp = robotparser.RobotFileParser()
    rp.set_url(robots_url)
    rp.read()
    return rp


def link_craw(start_url, link_regex, robots_url=None, user_agent='wswp'):
    """Crawl from start_url, following links that match link_regex and
    respecting the site's robots.txt."""
    if not robots_url:
        robots_url = '{}/robots.txt'.format(start_url)
    rp = get_robots_parser(robots_url)
    craw_queue = [start_url]
    seen = set(craw_queue)
    while craw_queue:
        url = craw_queue.pop()
        if rp.can_fetch(user_agent, url):
            html = download(url, user_agent=user_agent)
        else:
            print('Blocked by robots.txt:', url)
            html = None
        if html is None:
            continue
        for link in get_links(html):
            if re.match(link_regex, link):
                # Resolve relative links and skip URLs we have already queued.
                abs_link = urljoin(start_url, link)
                if abs_link not in seen:
                    seen.add(abs_link)
                    craw_queue.append(abs_link)
if __name__ == '__main__':
    # The 'BadCrawler' user agent is meant to exercise the robots.txt check,
    # which typically blocks it on the example site.
    link_craw('http://example.python-scraping.com',
              '/places/default/(index|view)', user_agent='BadCrawler')