| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283 |
- package com.xzl;
- import org.jsoup.Jsoup;
- import org.jsoup.nodes.Document;
- import org.jsoup.safety.Safelist;
- import java.io.*;
- import java.nio.file.Files;
- import java.nio.file.Paths;
- import java.util.stream.Stream;
- public class JsoupHtmlExtractor {
- private static String filePath = "C:\\Users\\GZ\\Desktop\\test.txt";
- public static void main(String[] args) {
- // String filePath = "example.txt";
- // 使用 try-with-resources 确保 BufferedReader 被自动关闭
- try (BufferedReader reader = new BufferedReader(new FileReader(filePath))) {
- StringWriter writer = new StringWriter();
- String line;
- // 按行读取并写入 StringWriter
- while ((line = reader.readLine()) != null) {
- writer.write(line);
- writer.write(System.lineSeparator()); // 写入换行符
- }
- // 从 StringWriter 中获取完整的字符串
- String content = writer.toString();
- System.out.println("文件内容:");
- System.out.println(content);
- // 提取并优化文本
- String pureText = extractAndOptimizeWithJsoup(content);
- System.out.println("Jsoup优化后的纯文本:");
- System.out.println(pureText);
- } catch (IOException e) {
- System.err.println("读取文件时发生错误: " + e.getMessage());
- e.printStackTrace();
- }
- }
- /**
- * 用Jsoup提取HTML纯文本并优化
- */
- public static String extractAndOptimizeWithJsoup(String html) {
- if (html == null || html.trim().isEmpty()) {
- return "";
- }
- // 步骤1:解析HTML(忽略文档类型、编码,自动处理嵌套)
- Document doc = Jsoup.parse(html);
- // 步骤2:提取纯文本(自动移除标签、注释、脚本)
- // 方式1:直接获取所有文本(保留段落逻辑,用换行分隔)
- // String text = doc.text();
- // 方式2:更灵活的过滤(用Safelist.none()表示不保留任何标签,仅文本)
- String text = Jsoup.clean(html, Safelist.none());
- // 步骤3:优化文本(同原生方案,去多余空白)
- text = optimizeText(text);
- return text;
- }
- /**
- * 文本优化(复用方案一的逻辑)
- */
- private static String optimizeText(String text) {
- text = text.replaceAll("\\s+", " "); // 连续空白→单个空格
- text = text.trim(); // 首尾去空
- return text;
- }
- }
|