Downloading Novels

I wrote a small Python script for downloading web novels. It works on most novel sites out of the box; a few sites require adjusting some parameters.

```python
from downloading_pics_and_novels import download_book


if __name__ == '__main__':
    download_book(
        url="https://ajnnan.com/52_52689/",
        bookname="求魔"
    )
```
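
For a site whose page structure differs from the defaults (`index={"id": "list"}`, `pages={"id": "content"}`), pass your own `find()` keyword arguments. A minimal sketch; the URL, book name, and selector values below are hypothetical placeholders, not taken from any real site:

```python
from downloading_pics_and_novels import download_book

if __name__ == '__main__':
    # Hypothetical selectors: inspect the target site's HTML and adjust.
    download_book(
        url="https://example.com/book_123/",
        bookname="example-book",
        index={"class_": "listmain"},    # kwargs for soup.find() on the index page
        pages={"id": "chaptercontent"},  # kwargs for soup.find() on each chapter page
    )
```

Both dicts are forwarded verbatim to BeautifulSoup's `find()`, so anything `find()` accepts (`id`, `class_`, `attrs`, a tag name via `name=...`) works here.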

I keep this in downloading_pics_and_novels.py. The code is as follows:

```python
import os
import re
import zipfile
from time import sleep

import requests
from bs4 import BeautifulSoup


def request(url, max_try=10):
    """
    request(url, max_try=10):
    Call requests.get(url, headers) up to max_try times until the HTML is fetched.
    """
    for _ in range(max_try):
        try:
            return requestUrl(url)
        except requests.RequestException:
            pass
    raise requests.RequestException(f"failed to fetch {url} after {max_try} tries")


def requestUrl(url, sleeptime=.5):
    """
    requestUrl(url)
    requests.get() with a browser User-Agent header added.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/99.0.4844.51 Safari/537.36"
    }
    requ = requests.get(url, headers=headers)
    requ.raise_for_status()  # raise requests.HTTPError on non-200 responses
    sleep(sleeptime)  # be polite: pause between requests
    return requ


class download_book:
    """
    class download_book(self, url, bookname, index, pages):
        url: URL of the chapter index page
        bookname: book title
        index : dict, find() kwargs for locating the chapter list
        pages : dict, find() kwargs for locating the chapter body

    self:
        base_url is the site's root URL
        loads : list of chapter titles already downloaded

    def:
        get_soup(url) fetch a page as BeautifulSoup
        get_url(url) expand a relative link into an absolute one (return)
        write(title : str, article : list)
            append the chapter title and body to the output file
    """
    def __init__(self, url, bookname, index=None, pages=None, base_url=None):
        # Derive the site root (scheme + host) from the index URL unless given.
        self.base_url = re.match(r"https?://[^/]+", url).group(0) if base_url is None else base_url
        self.bookname = bookname
        self.loads = []
        self.index = {"id": "list"} if index is None else index
        self.pages = {"id": "content"} if pages is None else pages
        # Resume support: the bare bookname file records finished chapter titles.
        if os.path.exists(self.bookname):
            with open(self.bookname, "r", encoding="utf8") as file:
                for line in file.readlines():
                    self.loads.append(line.strip())

        self.get_index(url)

    def get_index(self, url):
        # Walk every chapter link in the index and download the new ones.
        for href in self.get_soup(url).find(**self.index).find_all("a"):
            # self.get returns the chapter title and body (article : list)
            title, article = self.get(self.get_url(href['href']))

            if title not in self.loads:
                self.write(title, article)

    def get(self, url):
        soup = self.get_soup(url)

        title = soup.find("h1").text.strip()
        print(title)

        # stripped_strings yields every non-empty text fragment, already stripped.
        art = list(soup.find(**self.pages).stripped_strings)
        return title, art

    def get_soup(self, url):
        requ = request(url, max_try=20)
        return BeautifulSoup(requ.content, "html.parser")

    def get_url(self, url):
        if url.startswith("http"):
            return url
        return self.base_url + url

    def write(self, title, article):
        with open(self.bookname, "a", encoding="utf8") as file:
            file.write(title + "\n")  # record the title so reruns can skip it
        with open(f"{self.bookname}.txt", "a", encoding="utf8") as file:
            file.write(title + "\n\n")
            file.write("\n\n\t".join(article))
            file.write("\n\n")


def zipdir(dirName):
    # Pack every regular file in dirName into dirName.zip.
    with zipfile.ZipFile(dirName + ".zip", "w", zipfile.ZIP_DEFLATED) as file:
        for filename in os.listdir(dirName):
            if not os.path.isdir(f"{dirName}/{filename}"):
                file.write(f"{dirName}/{filename}")


def downloadPic(pic_url, file_name):
    if os.path.exists(file_name):
        print("\rexist " + file_name, end="")
        return
    img = request(pic_url).content
    print("\rwrite " + file_name, end="")
    with open(file_name, "wb") as file:
        file.write(img)
```
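
The module also contains the image helpers `downloadPic` and `zipdir`, which the post doesn't demonstrate. Here is a minimal sketch of how they fit together; the URLs and folder name are placeholder assumptions, not from the original code:

```python
import os

from downloading_pics_and_novels import downloadPic, zipdir

if __name__ == '__main__':
    # Placeholder inputs: replace with real image URLs.
    urls = [
        "https://example.com/pics/001.jpg",
        "https://example.com/pics/002.jpg",
    ]
    dirname = "my_album"
    os.makedirs(dirname, exist_ok=True)

    for i, u in enumerate(urls, start=1):
        # Save each image under a zero-padded name so files sort in order;
        # downloadPic skips files that already exist, so reruns are cheap.
        downloadPic(u, f"{dirname}/{i:03d}.jpg")

    # Bundle the finished folder into my_album.zip.
    zipdir(dirname)
```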

Author: Letter Wu
Post link: https://letterwu.github.io/2022/08/10/下载小说/
Copyright notice: Unless otherwise stated, all posts on this blog are licensed under CC BY-NC-SA 4.0. When reposting, please credit Have a nice day~ | Letter Wu's BLOG.