# Stdlib imports first, then third-party (bs4 / html2text / tqdm must be installed).
import os
import re

from bs4 import BeautifulSoup
import html2text
from tqdm.notebook import tqdm

# Shared HTML -> Markdown converter instance (referenced only by the
# commented-out alternative path in the original script; kept for parity).
converter = html2text.HTML2Text()
def clean_text(text):
    """Normalize whitespace, unescape HTML entities, and strip noise from *text*.

    Returns the cleaned string; raises UnicodeEncodeError if the result
    cannot be encoded as UTF-8 (the final round-trip acts as a validity check).
    """
    # Collapse every run of whitespace (spaces, tabs, newlines) to one space.
    text = ' '.join(text.split())
    # Unescape HTML entities (e.g. "&amp;" -> "&") via the html.parser backend.
    text = BeautifulSoup(text, 'html.parser').text
    # Strip each line and drop blank lines.
    # NOTE(review): after the whitespace collapse above, the input is a single
    # line, so this mostly strips the one remaining line — kept for parity.
    text = '\n'.join(line.strip() for line in text.splitlines() if line.strip())
    # Remove known noise text (placeholder pattern to be customized per corpus).
    text = re.sub(r'Some unwanted text or pattern', '', text)
    # UTF-8 round-trip: a no-op for valid text, raises on unencodable content.
    text = text.encode('utf-8').decode('utf-8')
    return text
def extract_text_from_html(file_path):
    """Read the HTML file at *file_path* and return cleaned text content.

    Extracts text from every <p> and <div> tag, cleans each chunk with
    clean_text(), and returns them joined, one chunk per line (each chunk
    is newline-terminated).
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()
    soup = BeautifulSoup(html_content, 'html.parser')
    # Paragraph-like tags carry the article body in these pages.
    # NOTE(review): <div> often nests <p>, so text may appear twice — confirm
    # this duplication is acceptable for the downstream dataset.
    paragraphs = soup.find_all(['p', 'div'])
    # Build with a list + join instead of quadratic string +=.
    chunks = [clean_text(para.get_text().strip()) for para in paragraphs]
    return ''.join(chunk + '\n' for chunk in chunks)
# Walk the HTML directory and extract text from every .html file into
# a list of {"text": ...} records (one record per file).
html_directory = '../data/html/'
all_texts = []

for filename in tqdm(os.listdir(html_directory)):
    if filename.endswith('.html'):
        file_path = os.path.join(html_directory, filename)
        text = extract_text_from_html(file_path)
        all_texts.append({"text": text})
# Save the data (保存数据)
import json

# Write the collected records to disk. indent=2 puts each object on its own
# indented lines; ensure_ascii=False keeps non-ASCII characters readable
# in the output file instead of \uXXXX escapes.
with open('nyu_data.json', 'w', encoding='utf-8') as file:
    json.dump(all_texts, file, ensure_ascii=False, indent=2)