2024-7-26-pdf切分章节与自动批量下载kimi回复并整理

txt版本切分章节不能包含图片,而且大多数电子书不是txt而是pdf或者epub。最近找到个很强大的电子书网站汇书网,不少书都能找到文字型pdf。修改pdf15mb以内的可以上iLovePDF | Online PDF tools for PDF lovers上修改标题以确定章节分隔标识符,pdf切分还有个好处是可以保留书中的插图,然而kimi、百度、阿里的ai目前还无法解读图片内容,所以艺术建筑装饰绘画类的书籍没法给出更进一步的启发,不过现在图像识别技术已经很成熟了估计再过段时间会支持。我测试了下按照章节切割数理类的书籍让kimi解读,也可以获得概要以及让它扮演提问型研究员和出题老师促进学习。

批量对指定文件夹下的pdf书籍切分章节输出到对应名字的文件夹
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
 

import os
import re
import fitz # PyMuPDF

def split_chapters_pdf(input_folder, output_folder):
    """Split each PDF book in *input_folder* into one PDF per chapter.

    Chapter boundaries are located by the heading pattern ``第<n>章aa``.
    NOTE(review): the trailing ``aa`` is presumably the marker the author
    injected into the book titles beforehand (see the accompanying text) —
    confirm it matches your documents. Each chapter's page range is copied
    into ``output_folder/<book name>/<chapter title>.pdf``.

    Parameters
    ----------
    input_folder : str
        Directory scanned (non-recursively) for ``*.pdf`` files.
    output_folder : str
        Directory that receives one sub-directory per book.
    """
    # Compile the heading pattern once instead of on every split/match call.
    chapter_re = re.compile(r'(第\s*\d+\s*章+aa)')

    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(input_folder):
        # Case-insensitive extension check so ".PDF" files are not skipped.
        if not filename.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(input_folder, filename)
        pdf_document = fitz.open(pdf_path)
        try:
            base_filename = os.path.splitext(filename)[0]
            book_output_folder = os.path.join(output_folder, base_filename)
            os.makedirs(book_output_folder, exist_ok=True)

            # Extract the whole book's text to locate chapter headings.
            full_text = ""
            for page_num in range(len(pdf_document)):
                full_text += pdf_document.load_page(page_num).get_text()

            # Debug aid: show a short prefix of the extracted text.
            print("提取的文本的前55个字符:")
            print(full_text[:55])

            # re.split with a capturing group yields
            # [preamble, title1, body1, title2, body2, ...].
            chapters = chapter_re.split(full_text)
            # BUGFIX: len(chapters) - 1 counted titles AND bodies; each
            # chapter contributes two list entries after the preamble.
            print(f"找到 {(len(chapters) - 1) // 2} 章节")

            # Titles sit at the odd indices of the split result.
            for i in range(1, len(chapters), 2):
                chapter_title = chapters[i].strip()

                print(f"正在处理章节:{chapter_title}")

                output_pdf_path = os.path.join(
                    book_output_folder,
                    f"{chapter_title.replace(' ', '_')}.pdf")
                chapter_pdf = fitz.open()

                # Walk the pages to find where this chapter starts and
                # where the next chapter's heading takes over.
                chapter_start_found = False
                start_page = 0
                for page_num in range(len(pdf_document)):
                    text = pdf_document.load_page(page_num).get_text()

                    if chapter_title in text:
                        if not chapter_start_found:
                            start_page = page_num
                            chapter_start_found = True
                    elif chapter_start_found:
                        # Find the next chapter title among the remaining
                        # split pieces (skipping this chapter's body).
                        next_chapter_title = None
                        for next_title in chapters[i + 2:]:
                            if chapter_re.match(next_title):
                                next_chapter_title = next_title.strip()
                                break
                        if next_chapter_title and next_chapter_title in text:
                            # This page already belongs to the next chapter,
                            # so the current chapter ends one page earlier.
                            end_page = page_num - 1
                            chapter_pdf.insert_pdf(
                                pdf_document,
                                from_page=start_page, to_page=end_page)
                            break
                        elif page_num == len(pdf_document) - 1:
                            # Last chapter runs to the end of the book.
                            chapter_pdf.insert_pdf(
                                pdf_document,
                                from_page=start_page, to_page=page_num)

                if chapter_pdf.page_count > 0:
                    chapter_pdf.save(output_pdf_path)
                    print(f"保存章节到:{output_pdf_path}")
                else:
                    print(f"没有找到章节内容:{chapter_title}")

                chapter_pdf.close()
        finally:
            # BUGFIX: always release the source document, even on error.
            pdf_document.close()

# Script entry point: guarded so importing this module does not trigger a
# full splitting run as a side effect.
if __name__ == "__main__":
    input_folder = 'E:/kimi_analyze'
    output_folder = 'E:/kimi_analyze'

    split_chapters_pdf(input_folder, output_folder)


自动下载是由于批量章节以后,网页长度太长了,开始的时候我发现kimi没有滚动条,不知道为啥这两天开始有了,kimi的加载可能是某种js翻页异步处理,并非完全加载在页面中,因此ctrl f无法搜索定位。第一版本实现了摘录kimi网页中的特定内容,标识为包含第x章的长文,但是显示效果不好,看了下kimi上的markdown标记并没有在html中显现所以我需要用程序给它后期格式化。第二版通过对带有“:”的短标题定位添加标识,这样理论上来说添加到hexo可以自动出markdown的序号使梗概阅读看起来清晰明了,并添加书签能快速定位章节。

批量下载kimi的总结回复并格式化显示效果
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153

import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from selenium.webdriver.common.action_chains import ActionChains
import time
from selenium.webdriver.chrome.options import Options

# Initialize the Selenium WebDriver.
chromedriver_path = 'E:\\kimi_analyze\\chromedriver-win64\\chromedriver.exe'  # replace with your local ChromeDriver path

# Chrome options: UI language, a custom user-agent string, and the existing
# Chrome profile directory (presumably so the Kimi login session is reused
# — verify the profile path on your machine).
options = Options()
options.add_argument('lang=zh_CN.UTF-8')
options.add_argument('user-agent="Mozilla/5.0 (iPod; U; CPU iPhone OS 2_1 like Mac OS X; ja-jp) AppleWebKit/525.18.1 (KHTML, like Gecko) Version/3.1.1 Mobile/5F137 Safari/525.20"')
options.add_argument(r'--user-data-dir=C:/Users/A/AppData/Local/Google/Chrome/User Data')

# Create the Chrome browser instance.
# NOTE(review): `executable_path` is deprecated in Selenium 4; if this call
# fails, pass service=Service(chromedriver_path) instead.
driver = webdriver.Chrome(executable_path=chromedriver_path, options=options)

# chapter title -> accumulated reply text.
chapters = {}
chapter_pattern = re.compile(r'第[一二三四五六七八九十百千]+章')

# Chapter titles already harvested, to avoid storing duplicates.
extracted_chapters = set()

try:
    # Open the target Kimi conversation.
    url = 'https://kimi.moonshot.cn/chat/cqg7m2mcp7f585g8lki0'
    driver.get(url)

    # Let the initial page load settle.
    time.sleep(3)

    # Focus the page body so PAGE_UP key presses reach it.
    driver.find_element(By.TAG_NAME, 'body').click()
    time.sleep(1)

    # Initialize ActionChains for synthetic keyboard input.
    actions = ActionChains(driver)

    # Scroll upwards to force the lazily-loaded replies to render,
    # harvesting chapter texts after every scroll step.
    for _ in range(22):  # adjust the scroll count as needed
        # Parse the currently rendered HTML.
        soup = BeautifulSoup(driver.page_source, 'lxml')
        divs = soup.find_all('div', {'data-index': True})

        for div in divs:
            # Plain text with line breaks preserved.
            text = div.get_text(separator='\n')

            # Short fragments are UI chrome; only long texts hold replies.
            if len(text) > 100:
                for chapter in chapter_pattern.findall(text):
                    if chapter not in extracted_chapters:
                        extracted_chapters.add(chapter)
                        chapters[chapter] = chapters.get(chapter, '') + '\n' + text

        # Simulate one PAGE_UP press to load older content.
        actions.key_down(Keys.PAGE_UP).key_up(Keys.PAGE_UP).perform()
        time.sleep(1)  # give the page time to load new content
finally:
    # BUGFIX: always close the browser, even if scraping raises.
    driver.quit()

def clean_text(text):
    """Normalize whitespace in a scraped Kimi reply.

    Drops any newline that immediately precedes a full-width colon, then
    collapses every run of consecutive newlines down to a single one.
    """
    joined_headings = re.sub(r'\n([:])', r'\1', text)
    return re.sub(r'\n{2,}', '\n', joined_headings)

# 为“:”处于同一行的文字字数不超过20个的行添加序号和Markdown加粗标记
def format_lines(text, chapter_counter):
lines = text.split('\n')
new_lines = []
current_counter = 1 # 当前行号
for line in lines:
if ':' in line:
parts = line.split(':', 1)
# 只有当“:”前的文字长度不超过20个字符时,才进行序号和加粗标记
if len(parts[0].strip()) <= 20:
new_line = f"\n{chapter_counter}. **{parts[0]}**:{parts[1]}"
chapter_counter += 1
else:
new_line = line
else:
new_line = line
new_lines.append(new_line)
return '\n'.join(new_lines), chapter_counter

def extract_chapter_number(chapter_title):
    """Return the Arabic digits following '第' as an int, or 0 if absent."""
    found = re.search(r'第(\d+)', chapter_title)
    return 0 if found is None else int(found.group(1))

def format_chapters(chapters):
    """Clean and number every chapter body, visiting chapters in order.

    Titles are processed in ascending Arabic chapter number (titles with no
    digits sort first, as number 0); each body is whitespace-cleaned and
    its short headings are numbered starting from 1.
    """
    formatted = {}
    for title in sorted(chapters, key=extract_chapter_number):
        cleaned = clean_text(chapters[title])
        formatted[title], _ = format_lines(cleaned, 1)
    return formatted

def remove_unwanted_text(text):
    """Cut the Kimi UI boilerplate ("复制/再试一次/分享") out of *text*.

    Everything from the boilerplate up to (but not including) the next
    chapter heading is removed; when no later heading exists, the whole
    tail starting at the boilerplate is dropped.
    """
    marker = '复制\n再试一次\n分享'
    marker_at = text.find(marker)
    if marker_at == -1:
        return text
    next_heading = chapter_pattern.search(text, marker_at)
    if next_heading is None:
        # No later chapter: discard everything from the marker onwards.
        return text[:marker_at]
    # Keep the text before the marker, then resume at the next chapter.
    return text[:marker_at] + text[next_heading.start():]

# Post-process the scraped replies: number/format each chapter body, then
# strip the Kimi UI boilerplate from every body.
chapters = format_chapters(chapters)
chapters = {k: remove_unwanted_text(v) for k, v in chapters.items()}

def sort_chapters(chapters):
    """Return (title, content) pairs ordered by Chinese chapter numeral.

    Only the single-character numerals 一..十 are mapped to values; any
    other numeral string (e.g. 十一) and titles without a heading sort as
    0, matching the original lookup table's behavior.
    """
    numeral_value = {'一': 1, '二': 2, '三': 3, '四': 4, '五': 5,
                     '六': 6, '七': 7, '八': 8, '九': 9, '十': 10}

    def order_key(title):
        found = re.search(r'第([一二三四五六七八九十百千]+)章', title)
        if found is None:
            return 0
        return numeral_value.get(found.group(1), 0)

    return sorted(chapters.items(), key=lambda pair: order_key(pair[0]))

# Order the scraped chapters before writing them out.
sorted_chapters = sort_chapters(chapters)

def save_chapters_to_file(chapters, file_path):
    """Write (title, content) pairs to *file_path* as UTF-8 text.

    Each chapter is emitted as its title line, the content, and a blank
    separator line.
    """
    with open(file_path, 'w', encoding='utf-8') as out:
        for title, body in chapters:
            out.write(f"{title}\n{body}\n\n")

# Persist the ordered chapter summaries next to the script.
save_chapters_to_file(sorted_chapters, 'chapters.txt')

2024-8-5-midjourney共享版批量做图提交定时暂停手动暂停 2024-7-25-Penda水的颂歌景观亭和海洋之森购物公园

评论

Your browser is out-of-date!

Update your browser to view this website correctly. Update my browser now

×