import os
import re
import time
import zipfile
from threading import Thread
from time import sleep

import requests
from bs4 import BeautifulSoup
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
def request(url, max_try=10):
    """request(url, max_try=10):
    Call requestUrl(url) (requests.get with headers) up to max_try times
    until the HTML page is fetched successfully.
    """
    for _ in range(max_try):
        try:
            return requestUrl(url)
        except SyntaxError:
            # requestUrl raises SyntaxError on a non-200 response; retry.
            pass
    raise SyntaxError(f"failed to fetch {url} after {max_try} tries")
def requestUrl(url, sleeptime=.5):
    """requestUrl(url)
    Wraps requests.get() and adds browser-like headers.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/99.0.4844.51 Safari/537.36"
    }
    requ = requests.get(url, headers=headers)
    if requ.status_code != 200:
        raise SyntaxError(f"ERROR: {requ.status_code}")
    sleep(sleeptime)
    return requ
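# A minimal sketch of how these two helpers combine (the URL below is a
# placeholder chosen for illustration, not part of the original project):
# request() keeps calling requestUrl() until a 200 response comes back or the
# retry budget runs out.
#
#     html = request("https://www.example.com/chapter/1.html", max_try=5).text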
class download_book:
    """class download_book(url, bookname, index, pages):
    url     : URL of the chapter index page
    bookname: book title (also used as the progress file name)
    index   : dict, keyword arguments for locating the table of contents
    pages   : dict, keyword arguments for locating the chapter body

    self:
        base_url: home page of the site
        loads   : list of chapter titles that have already been downloaded

    def:
        get_soup(url) : fetch a page as a BeautifulSoup object
        get_url(url)  : complete a relative link into an absolute URL (returned)
        write(title: str, article: list): write the chapter title and body to disk
    """

    def __init__(self, url, bookname, index=None, pages=None, base_url=None):
        self.base_url = re.search(r"^.*?com", url).group(0) if base_url is None else base_url
        self.bookname = bookname
        self.loads = []
        self.index = {"id": "list"} if index is None else index
        self.pages = {"id": "content"} if pages is None else pages
        # Resume support: the progress file lists titles that were already written.
        if os.path.exists(self.bookname):
            with open(self.bookname, "r", encoding="utf8") as file:
                for line in file.readlines():
                    self.loads.append(line.strip())
        self.get_index(url)
    def get_index(self, url):
        # Walk every link in the table of contents and download each chapter.
        for href in self.get_soup(url).find(**self.index).find_all("a"):
            title, article = self.get(self.get_url(href['href']))
            if title not in self.loads:
                self.write(title, article)
    def get(self, url):
        soup = self.get_soup(url)
        title = soup.find("h1").text.strip()
        print(title)
        art = []
        for line in soup.find(**self.pages).contents:
            line = line.text.strip()
            if line != "":
                art.append(line)
        return title, art
    def get_soup(self, url):
        requ = request(url, max_try=20)
        return BeautifulSoup(requ.content, "html.parser")
    def get_url(self, url):
        if url.startswith("http"):
            return url
        return self.base_url + url
    def write(self, title, article):
        # Record the title in the progress file so the chapter is skipped next run.
        with open(self.bookname, "a", encoding="utf8") as file:
            file.write(title + "\n")
        with open(f"{self.bookname}.txt", "a", encoding="utf8") as file:
            file.write(title + "\n\n")
            file.write("\n\n\t".join(article))
            file.write("\n\n")
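# A hedged usage sketch (the site URL, book name, and tag selectors below are
# illustrative placeholders, not values from the original project). Instantiating
# the class immediately crawls the table of contents and appends every new
# chapter to "<bookname>.txt", while the "<bookname>" file tracks progress:
#
#     download_book(
#         "https://www.example-novel.com/book/123/",  # chapter index page
#         "my_novel",                                  # progress file / my_novel.txt
#         index={"id": "list"},                        # ToC container: <div id="list">
#         pages={"id": "content"},                     # body container: <div id="content">
#     )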
def zipdir(dirName):
    # Pack every regular file directly inside dirName into "<dirName>.zip".
    with zipfile.ZipFile(dirName + ".zip", "w", zipfile.ZIP_DEFLATED) as file:
        for filename in os.listdir(dirName):
            if not os.path.isdir(f"{dirName}/{filename}"):
                file.write(f"{dirName}/{filename}")
def downloadPic(pic_url, file_name):
    # Skip files that have already been downloaded.
    if os.path.exists(file_name):
        print("\rexist " + file_name, end="")
        return
    img = request(pic_url).content
    print("\rwrite " + file_name, end="")
    with open(file_name, "wb") as file:
        file.write(img)
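# Sketch of how the two helpers above can be combined (the directory and image
# URL are illustrative placeholders): download a cover image into a directory,
# then pack that directory into a zip archive.
#
#     os.makedirs("covers", exist_ok=True)
#     downloadPic("https://www.example-novel.com/img/cover.jpg", "covers/cover.jpg")
#     zipdir("covers")  # produces covers.zip next to the directory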