2024-7-26-pdf切分章节与自动批量下载kimi回复并整理

txt版本切分章节不能包含图片,而且大多数电子书不是txt而是pdf或者epub。最近找到个很强大的电子书网站汇书网,不少书都能找到文字型pdf。修改pdf15mb以内的可以上iLovePDF | Online PDF tools for PDF lovers上修改标题以确定章节分隔标识符,pdf切分还有个好处是可以保留书中的插图,然而kimi、百度、阿里的ai目前还无法解读图片内容,所以艺术建筑装饰绘画类的书籍没法给出更进一步的启发,不过现在图像识别技术已经很成熟了估计再过段时间会支持。我测试了下按照章节切割数理类的书籍让kimi解读,也可以获得概要以及让它扮演提问型研究员和出题老师促进学习。

批量对指定文件夹下的pdf书籍切分章节输出到对应名字的文件夹
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
 

import os
import re
import fitz # PyMuPDF

def split_chapters_pdf(input_folder, output_folder):
    """Split each PDF book in *input_folder* into one PDF per chapter.

    Chapter boundaries are located by the heading pattern ``第<n>章aa``.
    NOTE(review): the trailing ``aa`` is presumably the marker the author
    injected into the book titles beforehand (see the accompanying text) —
    confirm it matches your documents. Each chapter's page range is copied
    into ``output_folder/<book name>/<chapter title>.pdf``.

    Parameters
    ----------
    input_folder : str
        Directory scanned (non-recursively) for ``*.pdf`` files.
    output_folder : str
        Directory that receives one sub-directory per book.
    """
    # Compile the heading pattern once instead of on every split/match call.
    chapter_re = re.compile(r'(第\s*\d+\s*章+aa)')

    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(input_folder):
        # Case-insensitive extension check so ".PDF" files are not skipped.
        if not filename.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(input_folder, filename)
        pdf_document = fitz.open(pdf_path)
        try:
            base_filename = os.path.splitext(filename)[0]
            book_output_folder = os.path.join(output_folder, base_filename)
            os.makedirs(book_output_folder, exist_ok=True)

            # Extract the whole book's text to locate chapter headings.
            full_text = ""
            for page_num in range(len(pdf_document)):
                full_text += pdf_document.load_page(page_num).get_text()

            # Debug aid: show a short prefix of the extracted text.
            print("提取的文本的前55个字符:")
            print(full_text[:55])

            # re.split with a capturing group yields
            # [preamble, title1, body1, title2, body2, ...].
            chapters = chapter_re.split(full_text)
            # BUGFIX: len(chapters) - 1 counted titles AND bodies; each
            # chapter contributes two list entries after the preamble.
            print(f"找到 {(len(chapters) - 1) // 2} 章节")

            # Titles sit at the odd indices of the split result.
            for i in range(1, len(chapters), 2):
                chapter_title = chapters[i].strip()

                print(f"正在处理章节:{chapter_title}")

                output_pdf_path = os.path.join(
                    book_output_folder,
                    f"{chapter_title.replace(' ', '_')}.pdf")
                chapter_pdf = fitz.open()

                # Walk the pages to find where this chapter starts and
                # where the next chapter's heading takes over.
                chapter_start_found = False
                start_page = 0
                for page_num in range(len(pdf_document)):
                    text = pdf_document.load_page(page_num).get_text()

                    if chapter_title in text:
                        if not chapter_start_found:
                            start_page = page_num
                            chapter_start_found = True
                    elif chapter_start_found:
                        # Find the next chapter title among the remaining
                        # split pieces (skipping this chapter's body).
                        next_chapter_title = None
                        for next_title in chapters[i + 2:]:
                            if chapter_re.match(next_title):
                                next_chapter_title = next_title.strip()
                                break
                        if next_chapter_title and next_chapter_title in text:
                            # This page already belongs to the next chapter,
                            # so the current chapter ends one page earlier.
                            end_page = page_num - 1
                            chapter_pdf.insert_pdf(
                                pdf_document,
                                from_page=start_page, to_page=end_page)
                            break
                        elif page_num == len(pdf_document) - 1:
                            # Last chapter runs to the end of the book.
                            chapter_pdf.insert_pdf(
                                pdf_document,
                                from_page=start_page, to_page=page_num)

                if chapter_pdf.page_count > 0:
                    chapter_pdf.save(output_pdf_path)
                    print(f"保存章节到:{output_pdf_path}")
                else:
                    print(f"没有找到章节内容:{chapter_title}")

                chapter_pdf.close()
        finally:
            # BUGFIX: always release the source document, even on error.
            pdf_document.close()

# Script entry point: guarded so importing this module does not trigger a
# full splitting run as a side effect.
if __name__ == "__main__":
    input_folder = 'E:/kimi_analyze'
    output_folder = 'E:/kimi_analyze'

    split_chapters_pdf(input_folder, output_folder)


自动下载是由于批量章节以后,网页长度太长了,开始的时候我发现kimi没有滚动条,不知道为啥这两天开始有了,kimi的加载可能是某种js翻页异步处理,并非完全加载在页面中,因此ctrl f无法搜索定位。第一版本实现了摘录kimi网页中的特定内容,标识为包含第x章的长文,但是显示效果不好,看了下kimi上的markdown标记并没有在html中显现所以我需要用程序给它后期格式化。第二版通过对带有“:”的短标题定位添加标识,这样理论上来说添加到hexo可以自动出markdown的序号使梗概阅读看起来清晰明了,并添加书签能快速定位章节。

批量下载kimi的总结回复并格式化显示效果
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153

import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from selenium.webdriver.common.action_chains import ActionChains
import time
from selenium.webdriver.chrome.options import Options

# Initialize the Selenium WebDriver.
chromedriver_path = 'E:\\kimi_analyze\\chromedriver-win64\\chromedriver.exe'  # replace with your local ChromeDriver path

# Chrome options: UI language, a custom user-agent string, and the existing
# Chrome profile directory (presumably so the Kimi login session is reused
# — verify the profile path on your machine).
options = Options()
options.add_argument('lang=zh_CN.UTF-8')
options.add_argument('user-agent="Mozilla/5.0 (iPod; U; CPU iPhone OS 2_1 like Mac OS X; ja-jp) AppleWebKit/525.18.1 (KHTML, like Gecko) Version/3.1.1 Mobile/5F137 Safari/525.20"')
options.add_argument(r'--user-data-dir=C:/Users/A/AppData/Local/Google/Chrome/User Data')

# Create the Chrome browser instance.
# NOTE(review): `executable_path` is deprecated in Selenium 4; if this call
# fails, pass service=Service(chromedriver_path) instead.
driver = webdriver.Chrome(executable_path=chromedriver_path, options=options)

# chapter title -> accumulated reply text.
chapters = {}
chapter_pattern = re.compile(r'第[一二三四五六七八九十百千]+章')

# Chapter titles already harvested, to avoid storing duplicates.
extracted_chapters = set()

try:
    # Open the target Kimi conversation.
    url = 'https://kimi.moonshot.cn/chat/cqg7m2mcp7f585g8lki0'
    driver.get(url)

    # Let the initial page load settle.
    time.sleep(3)

    # Focus the page body so PAGE_UP key presses reach it.
    driver.find_element(By.TAG_NAME, 'body').click()
    time.sleep(1)

    # Initialize ActionChains for synthetic keyboard input.
    actions = ActionChains(driver)

    # Scroll upwards to force the lazily-loaded replies to render,
    # harvesting chapter texts after every scroll step.
    for _ in range(22):  # adjust the scroll count as needed
        # Parse the currently rendered HTML.
        soup = BeautifulSoup(driver.page_source, 'lxml')
        divs = soup.find_all('div', {'data-index': True})

        for div in divs:
            # Plain text with line breaks preserved.
            text = div.get_text(separator='\n')

            # Short fragments are UI chrome; only long texts hold replies.
            if len(text) > 100:
                for chapter in chapter_pattern.findall(text):
                    if chapter not in extracted_chapters:
                        extracted_chapters.add(chapter)
                        chapters[chapter] = chapters.get(chapter, '') + '\n' + text

        # Simulate one PAGE_UP press to load older content.
        actions.key_down(Keys.PAGE_UP).key_up(Keys.PAGE_UP).perform()
        time.sleep(1)  # give the page time to load new content
finally:
    # BUGFIX: always close the browser, even if scraping raises.
    driver.quit()

def clean_text(text):
    """Normalize whitespace in a scraped Kimi reply.

    Drops any newline that immediately precedes a full-width colon, then
    collapses every run of consecutive newlines down to a single one.
    """
    joined_headings = re.sub(r'\n([:])', r'\1', text)
    return re.sub(r'\n{2,}', '\n', joined_headings)

# 为“:”处于同一行的文字字数不超过20个的行添加序号和Markdown加粗标记
def format_lines(text, chapter_counter):
lines = text.split('\n')
new_lines = []
current_counter = 1 # 当前行号
for line in lines:
if ':' in line:
parts = line.split(':', 1)
# 只有当“:”前的文字长度不超过20个字符时,才进行序号和加粗标记
if len(parts[0].strip()) <= 20:
new_line = f"\n{chapter_counter}. **{parts[0]}**:{parts[1]}"
chapter_counter += 1
else:
new_line = line
else:
new_line = line
new_lines.append(new_line)
return '\n'.join(new_lines), chapter_counter

def extract_chapter_number(chapter_title):
    """Return the Arabic digits following '第' as an int, or 0 if absent."""
    found = re.search(r'第(\d+)', chapter_title)
    return 0 if found is None else int(found.group(1))

def format_chapters(chapters):
    """Clean and number every chapter body, visiting chapters in order.

    Titles are processed in ascending Arabic chapter number (titles with no
    digits sort first, as number 0); each body is whitespace-cleaned and
    its short headings are numbered starting from 1.
    """
    formatted = {}
    for title in sorted(chapters, key=extract_chapter_number):
        cleaned = clean_text(chapters[title])
        formatted[title], _ = format_lines(cleaned, 1)
    return formatted

def remove_unwanted_text(text):
    """Cut the Kimi UI boilerplate ("复制/再试一次/分享") out of *text*.

    Everything from the boilerplate up to (but not including) the next
    chapter heading is removed; when no later heading exists, the whole
    tail starting at the boilerplate is dropped.
    """
    marker = '复制\n再试一次\n分享'
    marker_at = text.find(marker)
    if marker_at == -1:
        return text
    next_heading = chapter_pattern.search(text, marker_at)
    if next_heading is None:
        # No later chapter: discard everything from the marker onwards.
        return text[:marker_at]
    # Keep the text before the marker, then resume at the next chapter.
    return text[:marker_at] + text[next_heading.start():]

# Post-process the scraped replies: number/format each chapter body, then
# strip the Kimi UI boilerplate from every body.
chapters = format_chapters(chapters)
chapters = {k: remove_unwanted_text(v) for k, v in chapters.items()}

def sort_chapters(chapters):
    """Return (title, content) pairs ordered by Chinese chapter numeral.

    Only the single-character numerals 一..十 are mapped to values; any
    other numeral string (e.g. 十一) and titles without a heading sort as
    0, matching the original lookup table's behavior.
    """
    numeral_value = {'一': 1, '二': 2, '三': 3, '四': 4, '五': 5,
                     '六': 6, '七': 7, '八': 8, '九': 9, '十': 10}

    def order_key(title):
        found = re.search(r'第([一二三四五六七八九十百千]+)章', title)
        if found is None:
            return 0
        return numeral_value.get(found.group(1), 0)

    return sorted(chapters.items(), key=lambda pair: order_key(pair[0]))

# Order the scraped chapters before writing them out.
sorted_chapters = sort_chapters(chapters)

def save_chapters_to_file(chapters, file_path):
    """Write (title, content) pairs to *file_path* as UTF-8 text.

    Each chapter is emitted as its title line, the content, and a blank
    separator line.
    """
    with open(file_path, 'w', encoding='utf-8') as out:
        for title, body in chapters:
            out.write(f"{title}\n{body}\n\n")

# Persist the ordered chapter summaries next to the script.
save_chapters_to_file(sorted_chapters, 'chapters.txt')

2024-8-5-midjourney共享版批量做图提交定时暂停手动暂停 2024-7-25-Penda水的颂歌景观亭和海洋之森购物公园

评论

Your browser is out-of-date!

Update your browser to view this website correctly. Update my browser now

×