1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153
| import re from selenium import webdriver from selenium.webdriver.common.by import By from selenium.webdriver.common.keys import Keys from bs4 import BeautifulSoup from selenium.webdriver.common.action_chains import ActionChains import time from selenium.webdriver.chrome.options import Options
# ---------------------------------------------------------------------------
# Scrape step: open the chat page in Chrome, page upwards through the
# conversation and collect every text block that mentions a chapter heading.
# ---------------------------------------------------------------------------

# Path of the ChromeDriver binary handed to Selenium.
chromedriver_path = 'E:\\kimi_analyze\\chromedriver-win64\\chromedriver.exe'

options = Options()
options.add_argument('lang=zh_CN.UTF-8')
options.add_argument('user-agent="Mozilla/5.0 (iPod; U; CPU iPhone OS 2_1 like Mac OS X; ja-jp) AppleWebKit/525.18.1 (KHTML, like Gecko) Version/3.1.1 Mobile/5F137 Safari/525.20"')
# Point Chrome at an existing user-data directory
# (presumably so an already logged-in session is reused -- confirm).
options.add_argument(r'--user-data-dir=C:/Users/A/AppData/Local/Google/Chrome/User Data')

url = 'https://kimi.moonshot.cn/chat/cqg7m2mcp7f585g8lki0'

# chapter heading -> text of the first block in which that heading was seen
chapters = {}
# Matches headings such as "第一章" / "第十二章" (Chinese numerals only).
chapter_pattern = re.compile(r'第[一二三四五六七八九十百千]+章')
# Headings that have already been stored, so later pages do not overwrite them.
extracted_chapters = set()

# NOTE(review): recent Selenium 4 releases dropped the `executable_path`
# keyword in favour of a `Service` object; the call is left unchanged here --
# confirm against the installed Selenium version.
driver = webdriver.Chrome(executable_path=chromedriver_path, options=options)
try:
    driver.get(url)
    time.sleep(3)

    # Click the page body before sending keys
    # (presumably to give the page keyboard focus).
    driver.find_element(By.TAG_NAME, 'body').click()
    time.sleep(1)

    actions = ActionChains(driver)

    # Take 22 snapshots of the page, pressing PAGE_UP after each one.
    for _ in range(22):
        html_content = driver.page_source
        soup = BeautifulSoup(html_content, 'lxml')
        divs = soup.find_all('div', {'data-index': True})
        for div in divs:
            text = div.get_text(separator='\n')
            # Only blocks longer than 100 characters are considered.
            if len(text) <= 100:
                continue
            for chapter in chapter_pattern.findall(text):
                if chapter not in extracted_chapters:
                    extracted_chapters.add(chapter)
                    chapters[chapter] = chapters.get(chapter, '') + '\n' + text
        actions.key_down(Keys.PAGE_UP).key_up(Keys.PAGE_UP).perform()
        time.sleep(1)
finally:
    # Always shut the browser down, even when loading or scraping fails,
    # so no Chrome/ChromeDriver process is left behind.
    driver.quit()
def clean_text(text):
    """Tidy the line breaks of a scraped text block.

    First, a newline that is immediately followed by ``:`` is removed so the
    colon is pulled up onto the preceding line.  Afterwards every run of two
    or more newlines is collapsed into a single newline.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text.
    """
    colon_joined = re.sub(r'\n([:])', r'\1', text)
    return re.sub(r'\n{2,}', '\n', colon_joined)
def format_lines(text, chapter_counter):
    """Turn short ``label:rest`` lines into numbered, bold-labelled items.

    Each line is split at its first ``:``.  When the part before the colon is
    at most 20 characters long (ignoring surrounding whitespace), the line is
    rewritten as ``"\\n<n>. **<label>**:<rest>"`` and the counter advances by
    one.  Every other line is kept unchanged.

    Args:
        text: Newline-separated text to format.
        chapter_counter: Number assigned to the first rewritten line.

    Returns:
        A ``(formatted_text, next_counter)`` tuple, where ``next_counter`` is
        the number the next rewritten line would receive.
    """
    new_lines = []
    for line in text.split('\n'):
        label, colon, rest = line.partition(':')
        # `colon` is empty when the line has no ':' at all.
        if colon and len(label.strip()) <= 20:
            new_lines.append(f"\n{chapter_counter}. **{label}**:{rest}")
            chapter_counter += 1
        else:
            new_lines.append(line)
    return '\n'.join(new_lines), chapter_counter
def _chinese_numeral_to_int(numeral):
    """Convert a Chinese numeral such as ``二十一`` or ``一百零五`` to an int."""
    digit_values = {
        '零': 0, '一': 1, '二': 2, '两': 2, '三': 3, '四': 4,
        '五': 5, '六': 6, '七': 7, '八': 8, '九': 9,
    }
    unit_values = {'十': 10, '百': 100, '千': 1000}
    total = 0
    pending = 0  # digit waiting for its unit (or the trailing ones digit)
    for char in numeral:
        if char in unit_values:
            # A bare unit such as the leading 十 in 十一 means "one ten".
            total += (pending or 1) * unit_values[char]
            pending = 0
        else:
            pending = digit_values[char]
    return total + pending


def extract_chapter_number(chapter_title):
    """Return the chapter number found after ``第`` in *chapter_title*.

    Arabic digits (``第12章``) are tried first.  If none are present, a
    Chinese numeral (``第十二章``) is parsed instead, so titles written with
    Chinese numerals get their real number rather than 0.

    Args:
        chapter_title: The chapter heading text.

    Returns:
        The chapter number as an int, or 0 when no number follows ``第``.
    """
    match = re.search(r'第(\d+)', chapter_title)
    if match:
        return int(match.group(1))
    match = re.search(r'第([零一二两三四五六七八九十百千]+)', chapter_title)
    if match:
        return _chinese_numeral_to_int(match.group(1))
    return 0
def format_chapters(chapters):
    """Clean and format the text of every chapter.

    Chapter titles are visited in ascending order of
    ``extract_chapter_number``.  Each chapter's text is passed through
    ``clean_text`` and then ``format_lines`` with the item numbering
    restarted at 1 for every chapter.

    Args:
        chapters: Mapping of chapter title to raw chapter text.

    Returns:
        A new dict mapping each title to its formatted text, with keys
        inserted in the sorted order described above.
    """
    ordered_titles = sorted(chapters, key=extract_chapter_number)
    return {
        title: format_lines(clean_text(chapters[title]), 1)[0]
        for title in ordered_titles
    }
def remove_unwanted_text(text):
    """Cut out the text that starts at the first ``复制/再试一次/分享`` marker.

    The marker is the three lines ``复制``, ``再试一次`` and ``分享``
    (presumably the chat UI's action-button labels -- confirm).  Everything
    from the first marker up to the next chapter heading matched by the
    module-level ``chapter_pattern`` is removed.  If no heading follows the
    marker, the text is truncated at the marker.  Text without the marker is
    returned unchanged.

    Args:
        text: The chapter text to trim.

    Returns:
        The text with the unwanted span removed.
    """
    marker_pos = text.find('复制\n再试一次\n分享')
    if marker_pos == -1:
        return text

    kept_head = text[:marker_pos]
    following_heading = chapter_pattern.search(text, marker_pos)
    if following_heading is None:
        return kept_head
    return kept_head + text[following_heading.start():]
# Format every scraped chapter, then strip the unwanted marker text from
# each formatted result; `chapters` is rebound to the final mapping.
chapters = {
    title: remove_unwanted_text(body)
    for title, body in format_chapters(chapters).items()
}
def sort_chapters(chapters):
    """Return the chapters as ``(title, content)`` pairs in chapter order.

    The sort key is the Chinese numeral between ``第`` and ``章`` in the
    title, parsed as a full number (so ``第十一章`` is 11 and ``第二十章`` is
    20).  Titles without such a numeral get key 0 and therefore sort first;
    ties keep their original relative order.

    Args:
        chapters: Mapping of chapter title to chapter text.

    Returns:
        A list of ``(title, content)`` tuples sorted by chapter number.
    """
    digit_values = {
        '零': 0, '一': 1, '二': 2, '两': 2, '三': 3, '四': 4,
        '五': 5, '六': 6, '七': 7, '八': 8, '九': 9,
    }
    unit_values = {'十': 10, '百': 100, '千': 1000}

    def chapter_sort_key(chapter):
        match = re.search(r'第([零一二两三四五六七八九十百千]+)章', chapter)
        if not match:
            return 0
        total = 0
        pending = 0  # digit waiting for its unit (or the trailing ones digit)
        for char in match.group(1):
            if char in unit_values:
                # A bare unit such as the leading 十 in 十一 means "one ten".
                total += (pending or 1) * unit_values[char]
                pending = 0
            else:
                pending = digit_values[char]
        return total + pending

    return sorted(chapters.items(), key=lambda item: chapter_sort_key(item[0]))
# Order the processed chapters; the result is a list of (title, content) pairs.
sorted_chapters = sort_chapters(chapters)
def save_chapters_to_file(chapters, file_path):
    """Write chapters to a UTF-8 text file.

    For every ``(title, content)`` pair the title is written on its own
    line, followed by the content and one blank line.  An existing file at
    *file_path* is overwritten.

    Args:
        chapters: Iterable of ``(title, content)`` pairs.
        file_path: Destination path of the text file.
    """
    with open(file_path, 'w', encoding='utf-8') as out:
        out.writelines(
            f"{title}\n{body}\n\n" for title, body in chapters
        )
# Write the ordered chapters to 'chapters.txt' (path is relative to the
# current working directory).
save_chapters_to_file(sorted_chapters, 'chapters.txt')
|
评论