JsoupHtmlExtractor.java 2.5 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283
  1. package com.xzl;
  import java.io.*;
  import java.nio.charset.StandardCharsets;
  import java.nio.file.Files;
  import java.nio.file.Paths;
  import java.util.regex.Pattern;
  import java.util.stream.Stream;
  import org.jsoup.Jsoup;
  import org.jsoup.nodes.Document;
  import org.jsoup.safety.Safelist;
  9. public class JsoupHtmlExtractor {
  10. private static String filePath = "C:\\Users\\GZ\\Desktop\\test.txt";
  11. public static void main(String[] args) {
  12. // String filePath = "example.txt";
  13. // 使用 try-with-resources 确保 BufferedReader 被自动关闭
  14. try (BufferedReader reader = new BufferedReader(new FileReader(filePath))) {
  15. StringWriter writer = new StringWriter();
  16. String line;
  17. // 按行读取并写入 StringWriter
  18. while ((line = reader.readLine()) != null) {
  19. writer.write(line);
  20. writer.write(System.lineSeparator()); // 写入换行符
  21. }
  22. // 从 StringWriter 中获取完整的字符串
  23. String content = writer.toString();
  24. System.out.println("文件内容:");
  25. System.out.println(content);
  26. // 提取并优化文本
  27. String pureText = extractAndOptimizeWithJsoup(content);
  28. System.out.println("Jsoup优化后的纯文本:");
  29. System.out.println(pureText);
  30. } catch (IOException e) {
  31. System.err.println("读取文件时发生错误: " + e.getMessage());
  32. e.printStackTrace();
  33. }
  34. }
  35. /**
  36. * 用Jsoup提取HTML纯文本并优化
  37. */
  38. public static String extractAndOptimizeWithJsoup(String html) {
  39. if (html == null || html.trim().isEmpty()) {
  40. return "";
  41. }
  42. // 步骤1:解析HTML(忽略文档类型、编码,自动处理嵌套)
  43. Document doc = Jsoup.parse(html);
  44. // 步骤2:提取纯文本(自动移除标签、注释、脚本)
  45. // 方式1:直接获取所有文本(保留段落逻辑,用换行分隔)
  46. // String text = doc.text();
  47. // 方式2:更灵活的过滤(用Safelist.none()表示不保留任何标签,仅文本)
  48. String text = Jsoup.clean(html, Safelist.none());
  49. // 步骤3:优化文本(同原生方案,去多余空白)
  50. text = optimizeText(text);
  51. return text;
  52. }
  53. /**
  54. * 文本优化(复用方案一的逻辑)
  55. */
  56. private static String optimizeText(String text) {
  57. text = text.replaceAll("\\s+", " "); // 连续空白→单个空格
  58. text = text.trim(); // 首尾去空
  59. return text;
  60. }
  61. }