فهرست منبع

1.新增字段,优化后的H5内容(筛掉HTML相关标签)
2.新增展示,点击后新页面展示html格式内容

GongZheng 6 ماه پیش
والد
کامیت
ff3fcc6c56
24فایلهای تغییر یافته به همراه1037 افزوده شده و 2076 حذف شده
  1. 79 0
      xzl-admin/src/main/java/com/xzl/HtmlTextExtractor.java
  2. 83 0
      xzl-admin/src/main/java/com/xzl/JsoupHtmlExtractor.java
  3. 0 105
      xzl-admin/src/main/java/com/xzl/web/controller/SysSpiderAttachmentsController.java
  4. 0 105
      xzl-admin/src/main/java/com/xzl/web/controller/SysSpiderStructuredDataController.java
  5. 0 63
      xzl-admin/src/main/java/com/xzl/web/mapper/SysSpiderAttachmentsMapper.java
  6. 0 62
      xzl-admin/src/main/java/com/xzl/web/mapper/SysSpiderStructuredDataMapper.java
  7. 0 61
      xzl-admin/src/main/java/com/xzl/web/service/ISysSpiderAttachmentsService.java
  8. 0 62
      xzl-admin/src/main/java/com/xzl/web/service/ISysSpiderStructuredDataService.java
  9. 0 95
      xzl-admin/src/main/java/com/xzl/web/service/impl/SysSpiderAttachmentsServiceImpl.java
  10. 3 14
      xzl-admin/src/main/java/com/xzl/web/service/impl/SysSpiderSourceDataServiceImpl.java
  11. 0 95
      xzl-admin/src/main/java/com/xzl/web/service/impl/SysSpiderStructuredDataServiceImpl.java
  12. 36 0
      xzl-admin/src/main/java/com/xzl/web/utils/SeleniumUtils.java
  13. 0 91
      xzl-admin/src/main/resources/mapper/SysSpiderAttachmentsMapper.xml
  14. 6 1
      xzl-admin/src/main/resources/mapper/SysSpiderSourceDataMapper.xml
  15. 0 93
      xzl-admin/src/main/resources/mapper/SysSpiderStructuredDataMapper.xml
  16. 0 166
      xzl-system/src/main/java/com/xzl/system/domain/SysSpiderAttachments.java
  17. 58 39
      xzl-system/src/main/java/com/xzl/system/domain/SysSpiderSourceData.java
  18. 0 167
      xzl-system/src/main/java/com/xzl/system/domain/SysSpiderStructuredData.java
  19. 0 44
      xzl-ui/src/api/spiderData/attachments.js
  20. 0 44
      xzl-ui/src/api/spiderData/structured.js
  21. 0 313
      xzl-ui/src/views/spiderData/attachments/index.vue
  22. 297 99
      xzl-ui/src/views/spiderData/sourceData/index.vue
  23. 475 0
      xzl-ui/src/views/spiderData/sourceData/index.vuebak
  24. 0 357
      xzl-ui/src/views/spiderData/structured/index.vue

+ 79 - 0
xzl-admin/src/main/java/com/xzl/HtmlTextExtractor.java

@@ -0,0 +1,79 @@
+package com.xzl;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class HtmlTextExtractor {
+    // Matches every HTML tag, including self-closing ones such as <br/>.
+    private static final Pattern HTML_TAG_PATTERN = Pattern.compile("<[^>]+>");
+    // Matches one entity reference: named (&nbsp;), decimal (&#169;) or hexadecimal (&#xA9;).
+    private static final Pattern HTML_ENTITY_PATTERN = Pattern.compile("&[#a-zA-Z0-9]+;");
+
+    /**
+     * Small demo: strips a hard-coded HTML snippet and prints the resulting plain text.
+     *
+     * @param args unused
+     */
+    public static void main(String[] args) {
+        // Sample HTML containing tags, entities and redundant whitespace.
+        String htmlContent = "  <div class=\"article\">\n" +
+                "    <h1>Java提取HTML文本</h1>\n" +
+                "    <p>这是一段带<span style=\"color:red\">标签</span>的文本,&nbsp;包含&nbsp;空格实体。</p>\n" +
+                "    <p>  多余空格、\t制表符、\n换行符都需要优化  </p>\n" +
+                "  </div>";
+
+        String pureText = extractAndOptimizeText(htmlContent);
+
+        System.out.println("优化后的纯文本:");
+        System.out.println(pureText);
+    }
+
+    /**
+     * Extracts the plain text of an HTML string and normalises its whitespace.
+     *
+     * <p>Tags are removed with a regular expression (their text content is kept and
+     * nothing is inserted in their place), entity references are decoded, and every
+     * run of whitespace is collapsed to one space.
+     *
+     * @param html raw HTML; may be {@code null}
+     * @return the cleaned text, or an empty string when {@code html} is {@code null} or blank
+     */
+    public static String extractAndOptimizeText(String html) {
+        if (html == null || html.trim().isEmpty()) {
+            return "";
+        }
+
+        // Step 1: drop every tag.
+        String text = HTML_TAG_PATTERN.matcher(html).replaceAll("");
+
+        // Step 2: decode entity references (&nbsp; -> space, &lt; -> <, ...).
+        text = replaceHtmlEntities(text);
+
+        // Step 3: collapse whitespace and trim.
+        return optimizeText(text);
+    }
+
+    /**
+     * Decodes entity references in a single left-to-right pass.
+     *
+     * <p>Each reference is decoded exactly once. Chained {@code String.replace} calls
+     * would decode {@code &amp;quot;} twice (first to {@code &quot;}, then to {@code "}),
+     * which corrupts text that legitimately talks about entities.
+     */
+    private static String replaceHtmlEntities(String text) {
+        Matcher entityMatcher = HTML_ENTITY_PATTERN.matcher(text);
+        StringBuffer decoded = new StringBuffer(text.length());
+        while (entityMatcher.find()) {
+            // quoteReplacement: the decoded value may itself be '$' or '\'.
+            entityMatcher.appendReplacement(decoded,
+                    Matcher.quoteReplacement(decodeEntity(entityMatcher.group())));
+        }
+        entityMatcher.appendTail(decoded);
+        return decoded.toString();
+    }
+
+    /**
+     * Maps one entity reference to its character. Unknown named references are
+     * returned unchanged so no input is silently lost.
+     */
+    private static String decodeEntity(String entity) {
+        switch (entity) {
+            case "&nbsp;":
+                return " ";
+            case "&lt;":
+                return "<";
+            case "&gt;":
+                return ">";
+            case "&amp;":
+                return "&";
+            case "&quot;":
+                return "\"";
+            case "&apos;":
+                return "'";
+            default:
+                return decodeNumericEntity(entity);
+        }
+    }
+
+    /**
+     * Decodes {@code &#NNN;} / {@code &#xHHH;}; anything that is not a valid numeric
+     * reference is returned unchanged.
+     */
+    private static String decodeNumericEntity(String entity) {
+        if (entity.length() < 4 || entity.charAt(1) != '#') {
+            return entity;
+        }
+        String digits = entity.substring(2, entity.length() - 1);
+        try {
+            boolean hex = digits.startsWith("x") || digits.startsWith("X");
+            int codePoint = hex
+                    ? Integer.parseInt(digits.substring(1), 16)
+                    : Integer.parseInt(digits, 10);
+            if (codePoint <= 0 || !Character.isValidCodePoint(codePoint)) {
+                return entity;
+            }
+            if (codePoint == 0xA0) {
+                // Keep numeric NBSP consistent with the &nbsp; mapping above.
+                return " ";
+            }
+            return new String(Character.toChars(codePoint));
+        } catch (NumberFormatException ignored) {
+            // Not a number after all (e.g. "&#x;" or "&##1;"): leave the text as it was.
+            return entity;
+        }
+    }
+
+    /**
+     * Collapses every run of whitespace (spaces, tabs, line breaks) into one space
+     * and trims both ends.
+     */
+    private static String optimizeText(String text) {
+        return text.replaceAll("\\s+", " ").trim();
+    }
+}

+ 83 - 0
xzl-admin/src/main/java/com/xzl/JsoupHtmlExtractor.java

@@ -0,0 +1,83 @@
+package com.xzl;
+
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.safety.Safelist;
+
+import java.io.*;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.stream.Stream;
+
+public class JsoupHtmlExtractor {
+    // Sample input used when no path is given on the command line (developer-machine path).
+    private static final String DEFAULT_FILE_PATH = "C:\\Users\\GZ\\Desktop\\test.txt";
+
+    /**
+     * Small demo: reads an HTML file, prints it, then prints the extracted plain text.
+     *
+     * @param args optional; {@code args[0]} overrides the built-in sample path
+     */
+    public static void main(String[] args) {
+        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_FILE_PATH;
+
+        // NOTE(review): FileReader decodes with the platform default charset on JDKs
+        // before 18 - confirm the encoding of the sample file.
+        try (BufferedReader reader = new BufferedReader(new FileReader(path))) {
+            StringBuilder buffer = new StringBuilder();
+            String line;
+
+            // Read line by line, re-joining with the platform line separator.
+            while ((line = reader.readLine()) != null) {
+                buffer.append(line).append(System.lineSeparator());
+            }
+
+            String content = buffer.toString();
+
+            System.out.println("文件内容:");
+            System.out.println(content);
+
+            String pureText = extractAndOptimizeWithJsoup(content);
+
+            System.out.println("Jsoup优化后的纯文本:");
+            System.out.println(pureText);
+
+        } catch (IOException e) {
+            System.err.println("读取文件时发生错误: " + e.getMessage());
+            e.printStackTrace();
+        }
+    }
+
+    /**
+     * Extracts the plain text of an HTML string with Jsoup and normalises whitespace.
+     *
+     * <p>The text is taken from the parsed document rather than from
+     * {@code Jsoup.clean(html, Safelist.none())}: {@code clean} returns HTML, so its
+     * output still carries escaped entities such as {@code &amp;} or {@code &nbsp;}.
+     *
+     * @param html raw HTML; may be {@code null}
+     * @return the cleaned text, or an empty string when {@code html} is {@code null} or blank
+     */
+    public static String extractAndOptimizeWithJsoup(String html) {
+        if (html == null || html.trim().isEmpty()) {
+            return "";
+        }
+
+        // Parse once and read the decoded text of the whole document.
+        Document doc = Jsoup.parse(html);
+        String text = doc.text();
+
+        // Non-breaking spaces are not matched by \s, so map them to plain spaces first.
+        text = text.replace('\u00A0', ' ');
+
+        return optimizeText(text);
+    }
+
+    /**
+     * Collapses every run of whitespace into one space and trims both ends.
+     */
+    private static String optimizeText(String text) {
+        text = text.replaceAll("\\s+", " ");
+        text = text.trim();
+        return text;
+    }
+}

+ 0 - 105
xzl-admin/src/main/java/com/xzl/web/controller/SysSpiderAttachmentsController.java

@@ -1,105 +0,0 @@
-package com.xzl.web.controller;
-
-import java.util.List;
-import javax.servlet.http.HttpServletResponse;
-
-import com.xzl.system.domain.SysSpiderAttachments;
-import com.xzl.web.service.ISysSpiderAttachmentsService;
-import org.springframework.security.access.prepost.PreAuthorize;
-import org.springframework.beans.factory.annotation.Autowired;
-import org.springframework.web.bind.annotation.GetMapping;
-import org.springframework.web.bind.annotation.PostMapping;
-import org.springframework.web.bind.annotation.PutMapping;
-import org.springframework.web.bind.annotation.DeleteMapping;
-import org.springframework.web.bind.annotation.PathVariable;
-import org.springframework.web.bind.annotation.RequestBody;
-import org.springframework.web.bind.annotation.RequestMapping;
-import org.springframework.web.bind.annotation.RestController;
-import com.xzl.common.annotation.Log;
-import com.xzl.common.core.controller.BaseController;
-import com.xzl.common.core.domain.AjaxResult;
-import com.xzl.common.enums.BusinessType;
-import com.xzl.common.utils.poi.ExcelUtil;
-import com.xzl.common.core.page.TableDataInfo;
-
-/**
- * 附件Controller
- *
- * @author xzl
- * @date 2025-11-13
- */
-@RestController
-@RequestMapping("/spiderData/attachments")
-public class SysSpiderAttachmentsController extends BaseController
-{
-    @Autowired
-    private ISysSpiderAttachmentsService sysSpiderAttachmentsService;
-
-    /**
-     * 查询附件列表
-     */
-    @PreAuthorize("@ss.hasPermi('spiderData:attachments:list')")
-    @GetMapping("/list")
-    public TableDataInfo list(SysSpiderAttachments sysSpiderAttachments)
-    {
-        startPage();
-        List<SysSpiderAttachments> list = sysSpiderAttachmentsService.selectSysSpiderAttachmentsList(sysSpiderAttachments);
-        return getDataTable(list);
-    }
-
-    /**
-     * 导出附件列表
-     */
-    @PreAuthorize("@ss.hasPermi('spiderData:attachments:export')")
-    @Log(title = "附件", businessType = BusinessType.EXPORT)
-    @PostMapping("/export")
-    public void export(HttpServletResponse response, SysSpiderAttachments sysSpiderAttachments)
-    {
-        List<SysSpiderAttachments> list = sysSpiderAttachmentsService.selectSysSpiderAttachmentsList(sysSpiderAttachments);
-        ExcelUtil<SysSpiderAttachments> util = new ExcelUtil<SysSpiderAttachments>(SysSpiderAttachments.class);
-        util.exportExcel(response, list, "附件数据");
-    }
-
-    /**
-     * 获取附件详细信息
-     */
-    @PreAuthorize("@ss.hasPermi('spiderData:attachments:query')")
-    @GetMapping(value = "/{id}")
-    public AjaxResult getInfo(@PathVariable("id") Long id)
-    {
-        return success(sysSpiderAttachmentsService.selectSysSpiderAttachmentsById(id));
-    }
-
-    /**
-     * 新增附件
-     */
-    @PreAuthorize("@ss.hasPermi('spiderData:attachments:add')")
-    @Log(title = "附件", businessType = BusinessType.INSERT)
-    @PostMapping
-    public AjaxResult add(@RequestBody SysSpiderAttachments sysSpiderAttachments)
-    {
-        return toAjax(sysSpiderAttachmentsService.insertSysSpiderAttachments(sysSpiderAttachments));
-    }
-
-    /**
-     * 修改附件
-     */
-    @PreAuthorize("@ss.hasPermi('spiderData:attachments:edit')")
-    @Log(title = "附件", businessType = BusinessType.UPDATE)
-    @PutMapping
-    public AjaxResult edit(@RequestBody SysSpiderAttachments sysSpiderAttachments)
-    {
-        return toAjax(sysSpiderAttachmentsService.updateSysSpiderAttachments(sysSpiderAttachments));
-    }
-
-    /**
-     * 删除附件
-     */
-    @PreAuthorize("@ss.hasPermi('spiderData:attachments:remove')")
-    @Log(title = "附件", businessType = BusinessType.DELETE)
-	@DeleteMapping("/{ids}")
-    public AjaxResult remove(@PathVariable Long[] ids)
-    {
-        return toAjax(sysSpiderAttachmentsService.deleteSysSpiderAttachmentsByIds(ids));
-    }
-}

+ 0 - 105
xzl-admin/src/main/java/com/xzl/web/controller/SysSpiderStructuredDataController.java

@@ -1,105 +0,0 @@
-package com.xzl.web.controller;
-
-import java.util.List;
-import javax.servlet.http.HttpServletResponse;
-
-import com.xzl.system.domain.SysSpiderStructuredData;
-import com.xzl.web.service.ISysSpiderStructuredDataService;
-import org.springframework.security.access.prepost.PreAuthorize;
-import org.springframework.beans.factory.annotation.Autowired;
-import org.springframework.web.bind.annotation.GetMapping;
-import org.springframework.web.bind.annotation.PostMapping;
-import org.springframework.web.bind.annotation.PutMapping;
-import org.springframework.web.bind.annotation.DeleteMapping;
-import org.springframework.web.bind.annotation.PathVariable;
-import org.springframework.web.bind.annotation.RequestBody;
-import org.springframework.web.bind.annotation.RequestMapping;
-import org.springframework.web.bind.annotation.RestController;
-import com.xzl.common.annotation.Log;
-import com.xzl.common.core.controller.BaseController;
-import com.xzl.common.core.domain.AjaxResult;
-import com.xzl.common.enums.BusinessType;
-import com.xzl.common.utils.poi.ExcelUtil;
-import com.xzl.common.core.page.TableDataInfo;
-
-/**
- * 结构化数据Controller
- *
- * @author xzl
- * @date 2025-11-13
- */
-@RestController
-@RequestMapping("/spiderData/structured")
-public class SysSpiderStructuredDataController extends BaseController
-{
-    @Autowired
-    private ISysSpiderStructuredDataService sysSpiderStructuredDataService;
-
-    /**
-     * 查询结构化数据列表
-     */
-    @PreAuthorize("@ss.hasPermi('spiderData:structured:list')")
-    @GetMapping("/list")
-    public TableDataInfo list(SysSpiderStructuredData sysSpiderStructuredData)
-    {
-        startPage();
-        List<SysSpiderStructuredData> list = sysSpiderStructuredDataService.selectSysSpiderStructuredDataList(sysSpiderStructuredData);
-        return getDataTable(list);
-    }
-
-    /**
-     * 导出结构化数据列表
-     */
-    @PreAuthorize("@ss.hasPermi('spiderData:structured:export')")
-    @Log(title = "结构化数据", businessType = BusinessType.EXPORT)
-    @PostMapping("/export")
-    public void export(HttpServletResponse response, SysSpiderStructuredData sysSpiderStructuredData)
-    {
-        List<SysSpiderStructuredData> list = sysSpiderStructuredDataService.selectSysSpiderStructuredDataList(sysSpiderStructuredData);
-        ExcelUtil<SysSpiderStructuredData> util = new ExcelUtil<SysSpiderStructuredData>(SysSpiderStructuredData.class);
-        util.exportExcel(response, list, "结构化数据数据");
-    }
-
-    /**
-     * 获取结构化数据详细信息
-     */
-    @PreAuthorize("@ss.hasPermi('spiderData:structured:query')")
-    @GetMapping(value = "/{id}")
-    public AjaxResult getInfo(@PathVariable("id") Long id)
-    {
-        return success(sysSpiderStructuredDataService.selectSysSpiderStructuredDataById(id));
-    }
-
-    /**
-     * 新增结构化数据
-     */
-    @PreAuthorize("@ss.hasPermi('spiderData:structured:add')")
-    @Log(title = "结构化数据", businessType = BusinessType.INSERT)
-    @PostMapping
-    public AjaxResult add(@RequestBody SysSpiderStructuredData sysSpiderStructuredData)
-    {
-        return toAjax(sysSpiderStructuredDataService.insertSysSpiderStructuredData(sysSpiderStructuredData));
-    }
-
-    /**
-     * 修改结构化数据
-     */
-    @PreAuthorize("@ss.hasPermi('spiderData:structured:edit')")
-    @Log(title = "结构化数据", businessType = BusinessType.UPDATE)
-    @PutMapping
-    public AjaxResult edit(@RequestBody SysSpiderStructuredData sysSpiderStructuredData)
-    {
-        return toAjax(sysSpiderStructuredDataService.updateSysSpiderStructuredData(sysSpiderStructuredData));
-    }
-
-    /**
-     * 删除结构化数据
-     */
-    @PreAuthorize("@ss.hasPermi('spiderData:structured:remove')")
-    @Log(title = "结构化数据", businessType = BusinessType.DELETE)
-	@DeleteMapping("/{ids}")
-    public AjaxResult remove(@PathVariable Long[] ids)
-    {
-        return toAjax(sysSpiderStructuredDataService.deleteSysSpiderStructuredDataByIds(ids));
-    }
-}

+ 0 - 63
xzl-admin/src/main/java/com/xzl/web/mapper/SysSpiderAttachmentsMapper.java

@@ -1,63 +0,0 @@
-package com.xzl.web.mapper;
-
-
-import com.xzl.system.domain.SysSpiderAttachments;
-
-import java.util.List;
-
-/**
- * 附件Mapper接口
- *
- * @author xzl
- * @date 2025-11-13
- */
-public interface SysSpiderAttachmentsMapper
-{
-    /**
-     * 查询附件
-     *
-     * @param id 附件主键
-     * @return 附件
-     */
-    public SysSpiderAttachments selectSysSpiderAttachmentsById(Long id);
-
-    /**
-     * 查询附件列表
-     *
-     * @param sysSpiderAttachments 附件
-     * @return 附件集合
-     */
-    public List<SysSpiderAttachments> selectSysSpiderAttachmentsList(SysSpiderAttachments sysSpiderAttachments);
-
-    /**
-     * 新增附件
-     *
-     * @param sysSpiderAttachments 附件
-     * @return 结果
-     */
-    public int insertSysSpiderAttachments(SysSpiderAttachments sysSpiderAttachments);
-
-    /**
-     * 修改附件
-     *
-     * @param sysSpiderAttachments 附件
-     * @return 结果
-     */
-    public int updateSysSpiderAttachments(SysSpiderAttachments sysSpiderAttachments);
-
-    /**
-     * 删除附件
-     *
-     * @param id 附件主键
-     * @return 结果
-     */
-    public int deleteSysSpiderAttachmentsById(Long id);
-
-    /**
-     * 批量删除附件
-     *
-     * @param ids 需要删除的数据主键集合
-     * @return 结果
-     */
-    public int deleteSysSpiderAttachmentsByIds(Long[] ids);
-}

+ 0 - 62
xzl-admin/src/main/java/com/xzl/web/mapper/SysSpiderStructuredDataMapper.java

@@ -1,62 +0,0 @@
-package com.xzl.web.mapper;
-
-import java.util.List;
-
-import com.xzl.system.domain.SysSpiderStructuredData;
-
-/**
- * 结构化数据Mapper接口
- *
- * @author xzl
- * @date 2025-11-13
- */
-public interface SysSpiderStructuredDataMapper
-{
-    /**
-     * 查询结构化数据
-     *
-     * @param id 结构化数据主键
-     * @return 结构化数据
-     */
-    public SysSpiderStructuredData selectSysSpiderStructuredDataById(Long id);
-
-    /**
-     * 查询结构化数据列表
-     *
-     * @param sysSpiderStructuredData 结构化数据
-     * @return 结构化数据集合
-     */
-    public List<SysSpiderStructuredData> selectSysSpiderStructuredDataList(SysSpiderStructuredData sysSpiderStructuredData);
-
-    /**
-     * 新增结构化数据
-     *
-     * @param sysSpiderStructuredData 结构化数据
-     * @return 结果
-     */
-    public int insertSysSpiderStructuredData(SysSpiderStructuredData sysSpiderStructuredData);
-
-    /**
-     * 修改结构化数据
-     *
-     * @param sysSpiderStructuredData 结构化数据
-     * @return 结果
-     */
-    public int updateSysSpiderStructuredData(SysSpiderStructuredData sysSpiderStructuredData);
-
-    /**
-     * 删除结构化数据
-     *
-     * @param id 结构化数据主键
-     * @return 结果
-     */
-    public int deleteSysSpiderStructuredDataById(Long id);
-
-    /**
-     * 批量删除结构化数据
-     *
-     * @param ids 需要删除的数据主键集合
-     * @return 结果
-     */
-    public int deleteSysSpiderStructuredDataByIds(Long[] ids);
-}

+ 0 - 61
xzl-admin/src/main/java/com/xzl/web/service/ISysSpiderAttachmentsService.java

@@ -1,61 +0,0 @@
-package com.xzl.web.service;
-
-import java.util.List;
-import com.xzl.system.domain.SysSpiderAttachments;
-
-/**
- * 附件Service接口
- *
- * @author xzl
- * @date 2025-11-13
- */
-public interface ISysSpiderAttachmentsService
-{
-    /**
-     * 查询附件
-     *
-     * @param id 附件主键
-     * @return 附件
-     */
-    public SysSpiderAttachments selectSysSpiderAttachmentsById(Long id);
-
-    /**
-     * 查询附件列表
-     *
-     * @param sysSpiderAttachments 附件
-     * @return 附件集合
-     */
-    public List<SysSpiderAttachments> selectSysSpiderAttachmentsList(SysSpiderAttachments sysSpiderAttachments);
-
-    /**
-     * 新增附件
-     *
-     * @param sysSpiderAttachments 附件
-     * @return 结果
-     */
-    public int insertSysSpiderAttachments(SysSpiderAttachments sysSpiderAttachments);
-
-    /**
-     * 修改附件
-     *
-     * @param sysSpiderAttachments 附件
-     * @return 结果
-     */
-    public int updateSysSpiderAttachments(SysSpiderAttachments sysSpiderAttachments);
-
-    /**
-     * 批量删除附件
-     *
-     * @param ids 需要删除的附件主键集合
-     * @return 结果
-     */
-    public int deleteSysSpiderAttachmentsByIds(Long[] ids);
-
-    /**
-     * 删除附件信息
-     *
-     * @param id 附件主键
-     * @return 结果
-     */
-    public int deleteSysSpiderAttachmentsById(Long id);
-}

+ 0 - 62
xzl-admin/src/main/java/com/xzl/web/service/ISysSpiderStructuredDataService.java

@@ -1,62 +0,0 @@
-package com.xzl.web.service;
-
-import java.util.List;
-
-import com.xzl.system.domain.SysSpiderStructuredData;
-
-/**
- * 结构化数据Service接口
- *
- * @author xzl
- * @date 2025-11-13
- */
-public interface ISysSpiderStructuredDataService
-{
-    /**
-     * 查询结构化数据
-     *
-     * @param id 结构化数据主键
-     * @return 结构化数据
-     */
-    public SysSpiderStructuredData selectSysSpiderStructuredDataById(Long id);
-
-    /**
-     * 查询结构化数据列表
-     *
-     * @param sysSpiderStructuredData 结构化数据
-     * @return 结构化数据集合
-     */
-    public List<SysSpiderStructuredData> selectSysSpiderStructuredDataList(SysSpiderStructuredData sysSpiderStructuredData);
-
-    /**
-     * 新增结构化数据
-     *
-     * @param sysSpiderStructuredData 结构化数据
-     * @return 结果
-     */
-    public int insertSysSpiderStructuredData(SysSpiderStructuredData sysSpiderStructuredData);
-
-    /**
-     * 修改结构化数据
-     *
-     * @param sysSpiderStructuredData 结构化数据
-     * @return 结果
-     */
-    public int updateSysSpiderStructuredData(SysSpiderStructuredData sysSpiderStructuredData);
-
-    /**
-     * 批量删除结构化数据
-     *
-     * @param ids 需要删除的结构化数据主键集合
-     * @return 结果
-     */
-    public int deleteSysSpiderStructuredDataByIds(Long[] ids);
-
-    /**
-     * 删除结构化数据信息
-     *
-     * @param id 结构化数据主键
-     * @return 结果
-     */
-    public int deleteSysSpiderStructuredDataById(Long id);
-}

+ 0 - 95
xzl-admin/src/main/java/com/xzl/web/service/impl/SysSpiderAttachmentsServiceImpl.java

@@ -1,95 +0,0 @@
-package com.xzl.web.service.impl;
-
-import java.util.List;
-
-import com.xzl.system.domain.SysSpiderAttachments;
-import com.xzl.web.mapper.SysSpiderAttachmentsMapper;
-import com.xzl.web.service.ISysSpiderAttachmentsService;
-import org.springframework.beans.factory.annotation.Autowired;
-import org.springframework.stereotype.Service;
-
-
-/**
- * 附件Service业务层处理
- *
- * @author xzl
- * @date 2025-11-13
- */
-@Service
-public class SysSpiderAttachmentsServiceImpl implements ISysSpiderAttachmentsService
-{
-    @Autowired
-    private SysSpiderAttachmentsMapper sysSpiderAttachmentsMapper;
-
-    /**
-     * 查询附件
-     *
-     * @param id 附件主键
-     * @return 附件
-     */
-    @Override
-    public SysSpiderAttachments selectSysSpiderAttachmentsById(Long id)
-    {
-        return sysSpiderAttachmentsMapper.selectSysSpiderAttachmentsById(id);
-    }
-
-    /**
-     * 查询附件列表
-     *
-     * @param sysSpiderAttachments 附件
-     * @return 附件
-     */
-    @Override
-    public List<SysSpiderAttachments> selectSysSpiderAttachmentsList(SysSpiderAttachments sysSpiderAttachments)
-    {
-        return sysSpiderAttachmentsMapper.selectSysSpiderAttachmentsList(sysSpiderAttachments);
-    }
-
-    /**
-     * 新增附件
-     *
-     * @param sysSpiderAttachments 附件
-     * @return 结果
-     */
-    @Override
-    public int insertSysSpiderAttachments(SysSpiderAttachments sysSpiderAttachments)
-    {
-        return sysSpiderAttachmentsMapper.insertSysSpiderAttachments(sysSpiderAttachments);
-    }
-
-    /**
-     * 修改附件
-     *
-     * @param sysSpiderAttachments 附件
-     * @return 结果
-     */
-    @Override
-    public int updateSysSpiderAttachments(SysSpiderAttachments sysSpiderAttachments)
-    {
-        return sysSpiderAttachmentsMapper.updateSysSpiderAttachments(sysSpiderAttachments);
-    }
-
-    /**
-     * 批量删除附件
-     *
-     * @param ids 需要删除的附件主键
-     * @return 结果
-     */
-    @Override
-    public int deleteSysSpiderAttachmentsByIds(Long[] ids)
-    {
-        return sysSpiderAttachmentsMapper.deleteSysSpiderAttachmentsByIds(ids);
-    }
-
-    /**
-     * 删除附件信息
-     *
-     * @param id 附件主键
-     * @return 结果
-     */
-    @Override
-    public int deleteSysSpiderAttachmentsById(Long id)
-    {
-        return sysSpiderAttachmentsMapper.deleteSysSpiderAttachmentsById(id);
-    }
-}

+ 3 - 14
xzl-admin/src/main/java/com/xzl/web/service/impl/SysSpiderSourceDataServiceImpl.java

@@ -102,20 +102,6 @@ public class SysSpiderSourceDataServiceImpl implements ISysSpiderSourceDataServi
             // 1. 打印采集日志(便于调试)
             log.info("开始采集页面:{}", pageUrl);
 
-            // 2. 核心采集逻辑(根据需求实现)
-            // 示例:调用 HTTP 工具获取页面内容(可使用 HttpClient、OkHttp 等)
-            // String pageContent = httpClient.get(pageUrl);
-
-            // 3. 数据解析(如 HTML 解析、提取需要的内容)
-            // 示例:使用 Jsoup 解析 HTML
-            // Document doc = Jsoup.parse(pageContent);
-            // String title = doc.title(); // 提取页面标题
-            // String content = doc.select("body").text(); // 提取页面正文
-
-            // 4. 数据入库(保存到数据库,需注入 Mapper/Repository)
-            // 示例:spiderDataMapper.insert(new SpiderData(pageUrl, title, content));
-
-
             System.setProperty("webdriver.chrome.driver", "D:\\chromedriver.exe");
 
             // 浏览器配置(无头模式、禁用图片、设置超时)
@@ -137,6 +123,7 @@ public class SysSpiderSourceDataServiceImpl implements ISysSpiderSourceDataServi
             sysSpiderSourceData.setRawContent(htmlContent);
             sysSpiderSourceData.setRawAttachments(getHtmlFileLinkAndName(htmlContent));
             sysSpiderSourceData.setTaskId(UUID.randomUUID().toString());
+            sysSpiderSourceData.setRawContentFilter(SeleniumUtils.extractAndOptimizeWithJsoup(htmlContent));
             sysSpiderSourceDataMapper.insertSysSpiderSourceData(sysSpiderSourceData);
             // 5. 采集完成日志
             log.info("页面采集完成:{}", pageUrl);
@@ -149,6 +136,8 @@ public class SysSpiderSourceDataServiceImpl implements ISysSpiderSourceDataServi
         }
     }
 
+
+
     public String getHtmlFileLinkAndName(String htmlContent) {
         ArrayList<String> attrCollections = new ArrayList<>();
         attrCollections.add(".pdf");

+ 0 - 95
xzl-admin/src/main/java/com/xzl/web/service/impl/SysSpiderStructuredDataServiceImpl.java

@@ -1,95 +0,0 @@
-package com.xzl.web.service.impl;
-
-import com.xzl.system.domain.SysSpiderStructuredData;
-import com.xzl.web.mapper.SysSpiderStructuredDataMapper;
-import com.xzl.web.service.ISysSpiderStructuredDataService;
-import org.springframework.beans.factory.annotation.Autowired;
-import org.springframework.stereotype.Service;
-
-import java.util.List;
-
-
-/**
- * 结构化数据Service业务层处理
- *
- * @author xzl
- * @date 2025-11-13
- */
-@Service
-public class SysSpiderStructuredDataServiceImpl implements ISysSpiderStructuredDataService
-{
-    @Autowired
-    private SysSpiderStructuredDataMapper sysSpiderStructuredDataMapper;
-
-    /**
-     * 查询结构化数据
-     *
-     * @param id 结构化数据主键
-     * @return 结构化数据
-     */
-    @Override
-    public SysSpiderStructuredData selectSysSpiderStructuredDataById(Long id)
-    {
-        return sysSpiderStructuredDataMapper.selectSysSpiderStructuredDataById(id);
-    }
-
-    /**
-     * 查询结构化数据列表
-     *
-     * @param sysSpiderStructuredData 结构化数据
-     * @return 结构化数据
-     */
-    @Override
-    public List<SysSpiderStructuredData> selectSysSpiderStructuredDataList(SysSpiderStructuredData sysSpiderStructuredData)
-    {
-        return sysSpiderStructuredDataMapper.selectSysSpiderStructuredDataList(sysSpiderStructuredData);
-    }
-
-    /**
-     * 新增结构化数据
-     *
-     * @param sysSpiderStructuredData 结构化数据
-     * @return 结果
-     */
-    @Override
-    public int insertSysSpiderStructuredData(SysSpiderStructuredData sysSpiderStructuredData)
-    {
-        return sysSpiderStructuredDataMapper.insertSysSpiderStructuredData(sysSpiderStructuredData);
-    }
-
-    /**
-     * 修改结构化数据
-     *
-     * @param sysSpiderStructuredData 结构化数据
-     * @return 结果
-     */
-    @Override
-    public int updateSysSpiderStructuredData(SysSpiderStructuredData sysSpiderStructuredData)
-    {
-        return sysSpiderStructuredDataMapper.updateSysSpiderStructuredData(sysSpiderStructuredData);
-    }
-
-    /**
-     * 批量删除结构化数据
-     *
-     * @param ids 需要删除的结构化数据主键
-     * @return 结果
-     */
-    @Override
-    public int deleteSysSpiderStructuredDataByIds(Long[] ids)
-    {
-        return sysSpiderStructuredDataMapper.deleteSysSpiderStructuredDataByIds(ids);
-    }
-
-    /**
-     * 删除结构化数据信息
-     *
-     * @param id 结构化数据主键
-     * @return 结果
-     */
-    @Override
-    public int deleteSysSpiderStructuredDataById(Long id)
-    {
-        return sysSpiderStructuredDataMapper.deleteSysSpiderStructuredDataById(id);
-    }
-}

+ 36 - 0
xzl-admin/src/main/java/com/xzl/web/utils/SeleniumUtils.java

@@ -2,6 +2,9 @@ package com.xzl.web.utils;
 
 
 import lombok.extern.slf4j.Slf4j;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.safety.Safelist;
 import org.openqa.selenium.WebDriver;
 import org.openqa.selenium.chrome.ChromeDriver;
 import org.openqa.selenium.chrome.ChromeOptions;
@@ -81,4 +84,37 @@ public class SeleniumUtils {
             }
         }
     }
+
+
+    /**
+     * Extracts the plain text of an HTML string with Jsoup and normalises whitespace.
+     *
+     * <p>The text is taken from the parsed document rather than from
+     * {@code Jsoup.clean(html, Safelist.none())}: {@code clean} returns HTML, so its
+     * output still carries escaped entities such as {@code &amp;} or {@code &nbsp;},
+     * which would then be stored as if they were plain text.
+     *
+     * @param html raw HTML; may be {@code null}
+     * @return the cleaned text, or an empty string when {@code html} is {@code null} or blank
+     */
+    public static String extractAndOptimizeWithJsoup(String html) {
+        if (html == null || html.trim().isEmpty()) {
+            return "";
+        }
+
+        // Parse once and read the decoded text of the whole document.
+        Document doc = Jsoup.parse(html);
+        String text = doc.text();
+
+        // Non-breaking spaces are not matched by \s, so map them to plain spaces first.
+        text = text.replace('\u00A0', ' ');
+
+        // Collapse whitespace runs and trim.
+        return optimizeText(text);
+    }
+
+    /**
+     * Normalises whitespace: every run of whitespace characters becomes a single
+     * space, then leading and trailing whitespace is removed.
+     */
+    private static String optimizeText(String text) {
+        return text.replaceAll("\\s+", " ").trim();
+    }
 }

+ 0 - 91
xzl-admin/src/main/resources/mapper/SysSpiderAttachmentsMapper.xml

@@ -1,91 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" ?>
-<!DOCTYPE mapper
-PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN"
-"http://mybatis.org/dtd/mybatis-3-mapper.dtd">
-<mapper namespace="com.xzl.web.mapper.SysSpiderAttachmentsMapper">
-
-    <resultMap type="SysSpiderAttachments" id="SysSpiderAttachmentsResult">
-        <result property="id"    column="id"    />
-        <result property="rawDataId"    column="raw_data_id"    />
-        <result property="structuredDataId"    column="structured_data_id"    />
-        <result property="fileName"    column="file_name"    />
-        <result property="fileType"    column="file_type"    />
-        <result property="fileSize"    column="file_size"    />
-        <result property="originalUrl"    column="original_url"    />
-        <result property="localPath"    column="local_path"    />
-        <result property="downloadTime"    column="download_time"    />
-        <result property="taskId"    column="task_id"    />
-    </resultMap>
-
-    <sql id="selectSysSpiderAttachmentsVo">
-        select id, raw_data_id, structured_data_id, file_name, file_type, file_size, original_url, local_path, download_time, task_id from sys_spider_attachments
-    </sql>
-
-    <select id="selectSysSpiderAttachmentsList" parameterType="SysSpiderAttachments" resultMap="SysSpiderAttachmentsResult">
-        <include refid="selectSysSpiderAttachmentsVo"/>
-        <where>
-            <if test="fileName != null  and fileName != ''"> and file_name like concat('%', #{fileName}, '%')</if>
-            <if test="fileType != null  and fileType != ''"> and file_type = #{fileType}</if>
-            <if test="originalUrl != null  and originalUrl != ''"> and original_url like concat('%', #{originalUrl}, '%')</if>
-            <if test="taskId != null  and taskId != ''"> and task_id = #{taskId}</if>
-        </where>
-    </select>
-
-    <select id="selectSysSpiderAttachmentsById" parameterType="Long" resultMap="SysSpiderAttachmentsResult">
-        <include refid="selectSysSpiderAttachmentsVo"/>
-        where id = #{id}
-    </select>
-
-    <insert id="insertSysSpiderAttachments" parameterType="SysSpiderAttachments" useGeneratedKeys="true" keyProperty="id">
-        insert into sys_spider_attachments
-        <trim prefix="(" suffix=")" suffixOverrides=",">
-            <if test="rawDataId != null">raw_data_id,</if>
-            <if test="structuredDataId != null">structured_data_id,</if>
-            <if test="fileName != null">file_name,</if>
-            <if test="fileType != null">file_type,</if>
-            <if test="fileSize != null">file_size,</if>
-            <if test="originalUrl != null">original_url,</if>
-            <if test="localPath != null">local_path,</if>
-            <if test="downloadTime != null">download_time,</if>
-            <if test="taskId != null">task_id,</if>
-         </trim>
-        <trim prefix="values (" suffix=")" suffixOverrides=",">
-            <if test="rawDataId != null">#{rawDataId},</if>
-            <if test="structuredDataId != null">#{structuredDataId},</if>
-            <if test="fileName != null">#{fileName},</if>
-            <if test="fileType != null">#{fileType},</if>
-            <if test="fileSize != null">#{fileSize},</if>
-            <if test="originalUrl != null">#{originalUrl},</if>
-            <if test="localPath != null">#{localPath},</if>
-            <if test="downloadTime != null">#{downloadTime},</if>
-            <if test="taskId != null">#{taskId},</if>
-         </trim>
-    </insert>
-
-    <update id="updateSysSpiderAttachments" parameterType="SysSpiderAttachments">
-        update sys_spider_attachments
-        <trim prefix="SET" suffixOverrides=",">
-            <if test="rawDataId != null">raw_data_id = #{rawDataId},</if>
-            <if test="structuredDataId != null">structured_data_id = #{structuredDataId},</if>
-            <if test="fileName != null">file_name = #{fileName},</if>
-            <if test="fileType != null">file_type = #{fileType},</if>
-            <if test="fileSize != null">file_size = #{fileSize},</if>
-            <if test="originalUrl != null">original_url = #{originalUrl},</if>
-            <if test="localPath != null">local_path = #{localPath},</if>
-            <if test="downloadTime != null">download_time = #{downloadTime},</if>
-            <if test="taskId != null">task_id = #{taskId},</if>
-        </trim>
-        where id = #{id}
-    </update>
-
-    <delete id="deleteSysSpiderAttachmentsById" parameterType="Long">
-        delete from sys_spider_attachments where id = #{id}
-    </delete>
-
-    <delete id="deleteSysSpiderAttachmentsByIds" parameterType="String">
-        delete from sys_spider_attachments where id in
-        <foreach item="id" collection="array" open="(" separator="," close=")">
-            #{id}
-        </foreach>
-    </delete>
-</mapper>

+ 6 - 1
xzl-admin/src/main/resources/mapper/SysSpiderSourceDataMapper.xml

@@ -8,13 +8,14 @@ PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN"
         <result property="id"    column="id"    />
         <result property="pageUrl"    column="page_url"    />
         <result property="rawContent"    column="raw_content"    />
+        <result property="rawContentFilter"    column="raw_content_filter"    />
         <result property="rawAttachments"    column="raw_attachments"    />
         <result property="collectionTime"    column="collection_time"    />
         <result property="taskId"    column="task_id"    />
     </resultMap>
 
     <sql id="selectSysSpiderSourceDataVo">
-        select id, page_url, raw_content, raw_attachments, collection_time, task_id from sys_spider_source_data
+        select id, page_url, raw_content, raw_content_filter,raw_attachments, collection_time, task_id from sys_spider_source_data
     </sql>
 
     <select id="selectSysSpiderSourceDataList" parameterType="SysSpiderSourceData" resultMap="SysSpiderSourceDataResult">
@@ -22,6 +23,7 @@ PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN"
         <where>
             <if test="pageUrl != null  and pageUrl != ''"> and page_url like concat('%', #{pageUrl}, '%')</if>
             <if test="rawContent != null  and rawContent != ''"> and raw_content like concat('%', #{rawContent}, '%')</if>
+            <if test="rawContentFilter != null  and rawContentFilter != ''"> and raw_content_filter like concat('%', #{rawContentFilter}, '%')</if>
             <if test="rawAttachments != null  and rawAttachments != ''"> and raw_attachments like concat('%', #{rawAttachments}, '%')</if>
             <if test="params.beginCollectionTime != null and params.beginCollectionTime != '' and params.endCollectionTime != null and params.endCollectionTime != ''"> and collection_time between #{params.beginCollectionTime} and #{params.endCollectionTime}</if>
             <if test="taskId != null  and taskId != ''"> and task_id = #{taskId}</if>
@@ -38,6 +40,7 @@ PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN"
         <trim prefix="(" suffix=")" suffixOverrides=",">
             <if test="pageUrl != null">page_url,</if>
             <if test="rawContent != null">raw_content,</if>
+            <if test="rawContentFilter != null">raw_content_filter,</if>
             <if test="rawAttachments != null">raw_attachments,</if>
             <if test="collectionTime != null">collection_time,</if>
             <if test="taskId != null">task_id,</if>
@@ -45,6 +48,7 @@ PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN"
         <trim prefix="values (" suffix=")" suffixOverrides=",">
             <if test="pageUrl != null">#{pageUrl},</if>
             <if test="rawContent != null">#{rawContent},</if>
+            <if test="rawContentFilter != null">#{rawContentFilter},</if>
             <if test="rawAttachments != null">#{rawAttachments},</if>
             <if test="collectionTime != null">#{collectionTime},</if>
             <if test="taskId != null">#{taskId},</if>
@@ -56,6 +60,7 @@ PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN"
         <trim prefix="SET" suffixOverrides=",">
             <if test="pageUrl != null">page_url = #{pageUrl},</if>
             <if test="rawContent != null">raw_content = #{rawContent},</if>
+            <if test="rawContentFilter != null">raw_content_filter = #{rawContentFilter},</if>
             <if test="rawAttachments != null">raw_attachments = #{rawAttachments},</if>
             <if test="collectionTime != null">collection_time = #{collectionTime},</if>
             <if test="taskId != null">task_id = #{taskId},</if>

+ 0 - 93
xzl-admin/src/main/resources/mapper/SysSpiderStructuredDataMapper.xml

@@ -1,93 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" ?>
-<!DOCTYPE mapper
-PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN"
-"http://mybatis.org/dtd/mybatis-3-mapper.dtd">
-<mapper namespace="com.xzl.web.mapper.SysSpiderStructuredDataMapper">
-
-    <resultMap type="SysSpiderStructuredData" id="SysSpiderStructuredDataResult">
-        <result property="id"    column="id"    />
-        <result property="rawDataId"    column="raw_data_id"    />
-        <result property="title"    column="title"    />
-        <result property="content"    column="content"    />
-        <result property="author"    column="author"    />
-        <result property="department"    column="department"    />
-        <result property="publishTime"    column="publish_time"    />
-        <result property="keywords"    column="keywords"    />
-        <result property="customFields"    column="custom_fields"    />
-        <result property="processTime"    column="process_time"    />
-    </resultMap>
-
-    <sql id="selectSysSpiderStructuredDataVo">
-        select id, raw_data_id, title, content, author, department, publish_time, keywords, custom_fields, process_time from sys_spider_structured_data
-    </sql>
-
-    <select id="selectSysSpiderStructuredDataList" parameterType="SysSpiderStructuredData" resultMap="SysSpiderStructuredDataResult">
-        <include refid="selectSysSpiderStructuredDataVo"/>
-        <where>
-            <if test="title != null  and title != ''"> and title = #{title}</if>
-            <if test="content != null  and content != ''"> and content = #{content}</if>
-            <if test="author != null  and author != ''"> and author = #{author}</if>
-            <if test="department != null  and department != ''"> and department = #{department}</if>
-            <if test="params.beginPublishTime != null and params.beginPublishTime != '' and params.endPublishTime != null and params.endPublishTime != ''"> and publish_time between #{params.beginPublishTime} and #{params.endPublishTime}</if>
-            <if test="keywords != null  and keywords != ''"> and keywords = #{keywords}</if>
-        </where>
-    </select>
-
-    <select id="selectSysSpiderStructuredDataById" parameterType="Long" resultMap="SysSpiderStructuredDataResult">
-        <include refid="selectSysSpiderStructuredDataVo"/>
-        where id = #{id}
-    </select>
-
-    <insert id="insertSysSpiderStructuredData" parameterType="SysSpiderStructuredData" useGeneratedKeys="true" keyProperty="id">
-        insert into sys_spider_structured_data
-        <trim prefix="(" suffix=")" suffixOverrides=",">
-            <if test="rawDataId != null">raw_data_id,</if>
-            <if test="title != null">title,</if>
-            <if test="content != null">content,</if>
-            <if test="author != null">author,</if>
-            <if test="department != null">department,</if>
-            <if test="publishTime != null">publish_time,</if>
-            <if test="keywords != null">keywords,</if>
-            <if test="customFields != null">custom_fields,</if>
-            <if test="processTime != null">process_time,</if>
-         </trim>
-        <trim prefix="values (" suffix=")" suffixOverrides=",">
-            <if test="rawDataId != null">#{rawDataId},</if>
-            <if test="title != null">#{title},</if>
-            <if test="content != null">#{content},</if>
-            <if test="author != null">#{author},</if>
-            <if test="department != null">#{department},</if>
-            <if test="publishTime != null">#{publishTime},</if>
-            <if test="keywords != null">#{keywords},</if>
-            <if test="customFields != null">#{customFields},</if>
-            <if test="processTime != null">#{processTime},</if>
-         </trim>
-    </insert>
-
-    <update id="updateSysSpiderStructuredData" parameterType="SysSpiderStructuredData">
-        update sys_spider_structured_data
-        <trim prefix="SET" suffixOverrides=",">
-            <if test="rawDataId != null">raw_data_id = #{rawDataId},</if>
-            <if test="title != null">title = #{title},</if>
-            <if test="content != null">content = #{content},</if>
-            <if test="author != null">author = #{author},</if>
-            <if test="department != null">department = #{department},</if>
-            <if test="publishTime != null">publish_time = #{publishTime},</if>
-            <if test="keywords != null">keywords = #{keywords},</if>
-            <if test="customFields != null">custom_fields = #{customFields},</if>
-            <if test="processTime != null">process_time = #{processTime},</if>
-        </trim>
-        where id = #{id}
-    </update>
-
-    <delete id="deleteSysSpiderStructuredDataById" parameterType="Long">
-        delete from sys_spider_structured_data where id = #{id}
-    </delete>
-
-    <delete id="deleteSysSpiderStructuredDataByIds" parameterType="String">
-        delete from sys_spider_structured_data where id in
-        <foreach item="id" collection="array" open="(" separator="," close=")">
-            #{id}
-        </foreach>
-    </delete>
-</mapper>

+ 0 - 166
xzl-system/src/main/java/com/xzl/system/domain/SysSpiderAttachments.java

@@ -1,166 +0,0 @@
-package com.xzl.system.domain;
-
-import java.util.Date;
-import com.fasterxml.jackson.annotation.JsonFormat;
-import com.xzl.common.annotation.Excel;
-import com.xzl.common.core.domain.BaseEntity;
-import org.apache.commons.lang3.builder.ToStringBuilder;
-import org.apache.commons.lang3.builder.ToStringStyle;
-
-/**
- * 附件对象 sys_spider_attachments
- *
- * @author xzl
- * @date 2025-11-13
- */
-public class SysSpiderAttachments extends BaseEntity
-{
-    private static final long serialVersionUID = 1L;
-
-    /** 附件ID */
-    private Long id;
-
-    /** 关联原始数据ID */
-    @Excel(name = "关联原始数据ID")
-    private Long rawDataId;
-
-    /** 关联结构化数据ID */
-    @Excel(name = "关联结构化数据ID")
-    private Long structuredDataId;
-
-    /** 附件文件名 */
-    @Excel(name = "附件文件名")
-    private String fileName;
-
-    /** 文件类型(pdf/docx/xlsx等) */
-    @Excel(name = "文件类型", readConverterExp = "p=df/docx/xlsx等")
-    private String fileType;
-
-    /** 文件大小(字节) */
-    @Excel(name = "文件大小", readConverterExp = "字=节")
-    private Long fileSize;
-
-    /** OA系统原始URL */
-    @Excel(name = "OA系统原始URL")
-    private String originalUrl;
-
-    /** 本地存储路径 */
-    @Excel(name = "本地存储路径")
-    private String localPath;
-
-    /** 下载时间 */
-    @JsonFormat(pattern = "yyyy-MM-dd")
-    @Excel(name = "下载时间", width = 30, dateFormat = "yyyy-MM-dd")
-    private Date downloadTime;
-
-    /** 关联采集任务ID */
-    @Excel(name = "关联采集任务ID")
-    private String taskId;
-
-    public void setId(Long id)
-    {
-        this.id = id;
-    }
-
-    public Long getId()
-    {
-        return id;
-    }
-    public void setRawDataId(Long rawDataId)
-    {
-        this.rawDataId = rawDataId;
-    }
-
-    public Long getRawDataId()
-    {
-        return rawDataId;
-    }
-    public void setStructuredDataId(Long structuredDataId)
-    {
-        this.structuredDataId = structuredDataId;
-    }
-
-    public Long getStructuredDataId()
-    {
-        return structuredDataId;
-    }
-    public void setFileName(String fileName)
-    {
-        this.fileName = fileName;
-    }
-
-    public String getFileName()
-    {
-        return fileName;
-    }
-    public void setFileType(String fileType)
-    {
-        this.fileType = fileType;
-    }
-
-    public String getFileType()
-    {
-        return fileType;
-    }
-    public void setFileSize(Long fileSize)
-    {
-        this.fileSize = fileSize;
-    }
-
-    public Long getFileSize()
-    {
-        return fileSize;
-    }
-    public void setOriginalUrl(String originalUrl)
-    {
-        this.originalUrl = originalUrl;
-    }
-
-    public String getOriginalUrl()
-    {
-        return originalUrl;
-    }
-    public void setLocalPath(String localPath)
-    {
-        this.localPath = localPath;
-    }
-
-    public String getLocalPath()
-    {
-        return localPath;
-    }
-    public void setDownloadTime(Date downloadTime)
-    {
-        this.downloadTime = downloadTime;
-    }
-
-    public Date getDownloadTime()
-    {
-        return downloadTime;
-    }
-    public void setTaskId(String taskId)
-    {
-        this.taskId = taskId;
-    }
-
-    public String getTaskId()
-    {
-        return taskId;
-    }
-
-    @Override
-    public String toString() {
-        return new ToStringBuilder(this, ToStringStyle.MULTI_LINE_STYLE)
-            .append("id", getId())
-            .append("rawDataId", getRawDataId())
-            .append("structuredDataId", getStructuredDataId())
-            .append("fileName", getFileName())
-            .append("fileType", getFileType())
-            .append("fileSize", getFileSize())
-            .append("originalUrl", getOriginalUrl())
-            .append("localPath", getLocalPath())
-            .append("downloadTime", getDownloadTime())
-            .append("taskId", getTaskId())
-            .toString();
-    }
-}

+ 58 - 39
xzl-system/src/main/java/com/xzl/system/domain/SysSpiderSourceData.java

@@ -14,99 +14,118 @@ import java.util.Date;
  * @author xzl
  * @date 2025-11-13
  */
-public class SysSpiderSourceData extends BaseEntity
-{
+public class SysSpiderSourceData extends BaseEntity {
     private static final long serialVersionUID = 1L;
 
-    /** 原始数据ID */
+    /**
+     * 原始数据ID
+     */
     private Long id;
 
-    /** 采集页面URL */
+    /**
+     * 采集页面URL
+     */
     @Excel(name = "采集页面URL")
     private String pageUrl;
 
-    /** 原始文本内容(HTML/JSON等) */
+    /**
+     * 原始文本内容(HTML/JSON等)
+     */
     @Excel(name = "原始文本内容", readConverterExp = "H=TML/JSON等")
     private String rawContent;
 
-    /** 原始附件信息(JSON格式:文件名、URL等) */
+    /**
+     * Filtered text content: the raw page content with HTML tags stripped out.
+     */
+    @Excel(name = "过滤后文本内容")
+    private String rawContentFilter;
+
+    /**
+     * 原始附件信息(JSON格式:文件名、URL等)
+     */
     @Excel(name = "原始附件信息", readConverterExp = "J=SON格式:文件名、URL等")
     private String rawAttachments;
 
-    /** 采集时间 */
+    /**
+     * 采集时间
+     */
     @JsonFormat(pattern = "yyyy-MM-dd")
     @Excel(name = "采集时间", width = 30, dateFormat = "yyyy-MM-dd")
     private Date collectionTime;
 
-    /** 关联采集任务ID(便于追溯) */
+    /**
+     * 关联采集任务ID(便于追溯)
+     */
     @Excel(name = "关联采集任务ID", readConverterExp = "便=于追溯")
     private String taskId;
 
-    public void setId(Long id)
-    {
+    public void setId(Long id) {
         this.id = id;
     }
 
-    public Long getId()
-    {
+    public Long getId() {
         return id;
     }
-    public void setPageUrl(String pageUrl)
-    {
+
+    public void setPageUrl(String pageUrl) {
         this.pageUrl = pageUrl;
     }
 
-    public String getPageUrl()
-    {
+    public String getPageUrl() {
         return pageUrl;
     }
-    public void setRawContent(String rawContent)
-    {
+
+    public void setRawContent(String rawContent) {
         this.rawContent = rawContent;
     }
 
-    public String getRawContent()
-    {
+    public void setRawContentFilter(String rawContentFilter) {
+        this.rawContentFilter = rawContentFilter;
+    }
+
+    public String getRawContent() {
         return rawContent;
     }
-    public void setRawAttachments(String rawAttachments)
-    {
+
+    public String getRawContentFilter() {
+        return rawContentFilter;
+    }
+
+    public void setRawAttachments(String rawAttachments) {
         this.rawAttachments = rawAttachments;
     }
 
-    public String getRawAttachments()
-    {
+    public String getRawAttachments() {
         return rawAttachments;
     }
-    public void setCollectionTime(Date collectionTime)
-    {
+
+    public void setCollectionTime(Date collectionTime) {
         this.collectionTime = collectionTime;
     }
 
-    public Date getCollectionTime()
-    {
+    public Date getCollectionTime() {
         return collectionTime;
     }
-    public void setTaskId(String taskId)
-    {
+
+    public void setTaskId(String taskId) {
         this.taskId = taskId;
     }
 
-    public String getTaskId()
-    {
+    public String getTaskId() {
         return taskId;
     }
 
     @Override
     public String toString() {
         return new ToStringBuilder(this, ToStringStyle.MULTI_LINE_STYLE)
-            .append("id", getId())
-            .append("pageUrl", getPageUrl())
-            .append("rawContent", getRawContent())
-            .append("rawAttachments", getRawAttachments())
-            .append("collectionTime", getCollectionTime())
-            .append("taskId", getTaskId())
-            .toString();
+                .append("id", getId())
+                .append("pageUrl", getPageUrl())
+                .append("rawContent", getRawContent())
+                .append("rawContentFilter", getRawContentFilter())
+                .append("rawAttachments", getRawAttachments())
+                .append("collectionTime", getCollectionTime())
+                .append("taskId", getTaskId())
+                .toString();
     }
 
 }

+ 0 - 167
xzl-system/src/main/java/com/xzl/system/domain/SysSpiderStructuredData.java

@@ -1,167 +0,0 @@
-package com.xzl.system.domain;
-
-import java.util.Date;
-import com.fasterxml.jackson.annotation.JsonFormat;
-import com.xzl.common.annotation.Excel;
-import com.xzl.common.core.domain.BaseEntity;
-import org.apache.commons.lang3.builder.ToStringBuilder;
-import org.apache.commons.lang3.builder.ToStringStyle;
-
-/**
- * 结构化数据对象 sys_spider_structured_data
- *
- * @author xzl
- * @date 2025-11-13
- */
-public class SysSpiderStructuredData extends BaseEntity
-{
-    private static final long serialVersionUID = 1L;
-
-    /** 结构化数据ID */
-    private Long id;
-
-    /** 关联原始数据ID */
-    @Excel(name = "关联原始数据ID")
-    private Long rawDataId;
-
-    /** 文档标题 */
-    @Excel(name = "文档标题")
-    private String title;
-
-    /** 结构化文本内容 */
-    @Excel(name = "结构化文本内容")
-    private String content;
-
-    /** 作者 */
-    @Excel(name = "作者")
-    private String author;
-
-    /** 所属部门 */
-    @Excel(name = "所属部门")
-    private String department;
-
-    /** 发布时间 */
-    @JsonFormat(pattern = "yyyy-MM-dd")
-    @Excel(name = "发布时间", width = 30, dateFormat = "yyyy-MM-dd")
-    private Date publishTime;
-
-    /** 关键词(逗号分隔) */
-    @Excel(name = "关键词", readConverterExp = "逗=号分隔")
-    private String keywords;
-
-    /** 自定义字段(JSON格式) */
-    @Excel(name = "自定义字段", readConverterExp = "J=SON格式")
-    private String customFields;
-
-    /** 处理时间 */
-    @JsonFormat(pattern = "yyyy-MM-dd")
-    @Excel(name = "处理时间", width = 30, dateFormat = "yyyy-MM-dd")
-    private Date processTime;
-
-    public void setId(Long id)
-    {
-        this.id = id;
-    }
-
-    public Long getId()
-    {
-        return id;
-    }
-    public void setRawDataId(Long rawDataId)
-    {
-        this.rawDataId = rawDataId;
-    }
-
-    public Long getRawDataId()
-    {
-        return rawDataId;
-    }
-    public void setTitle(String title)
-    {
-        this.title = title;
-    }
-
-    public String getTitle()
-    {
-        return title;
-    }
-    public void setContent(String content)
-    {
-        this.content = content;
-    }
-
-    public String getContent()
-    {
-        return content;
-    }
-    public void setAuthor(String author)
-    {
-        this.author = author;
-    }
-
-    public String getAuthor()
-    {
-        return author;
-    }
-    public void setDepartment(String department)
-    {
-        this.department = department;
-    }
-
-    public String getDepartment()
-    {
-        return department;
-    }
-    public void setPublishTime(Date publishTime)
-    {
-        this.publishTime = publishTime;
-    }
-
-    public Date getPublishTime()
-    {
-        return publishTime;
-    }
-    public void setKeywords(String keywords)
-    {
-        this.keywords = keywords;
-    }
-
-    public String getKeywords()
-    {
-        return keywords;
-    }
-    public void setCustomFields(String customFields)
-    {
-        this.customFields = customFields;
-    }
-
-    public String getCustomFields()
-    {
-        return customFields;
-    }
-    public void setProcessTime(Date processTime)
-    {
-        this.processTime = processTime;
-    }
-
-    public Date getProcessTime()
-    {
-        return processTime;
-    }
-
-    @Override
-    public String toString() {
-        return new ToStringBuilder(this, ToStringStyle.MULTI_LINE_STYLE)
-            .append("id", getId())
-            .append("rawDataId", getRawDataId())
-            .append("title", getTitle())
-            .append("content", getContent())
-            .append("author", getAuthor())
-            .append("department", getDepartment())
-            .append("publishTime", getPublishTime())
-            .append("keywords", getKeywords())
-            .append("customFields", getCustomFields())
-            .append("processTime", getProcessTime())
-            .toString();
-    }
-}

+ 0 - 44
xzl-ui/src/api/spiderData/attachments.js

@@ -1,44 +0,0 @@
-import request from '@/utils/request'
-
-// 查询附件列表
-export function listAttachments(query) {
-  return request({
-    url: '/spiderData/attachments/list',
-    method: 'get',
-    params: query
-  })
-}
-
-// 查询附件详细
-export function getAttachments(id) {
-  return request({
-    url: '/spiderData/attachments/' + id,
-    method: 'get'
-  })
-}
-
-// 新增附件
-export function addAttachments(data) {
-  return request({
-    url: '/spiderData/attachments',
-    method: 'post',
-    data: data
-  })
-}
-
-// 修改附件
-export function updateAttachments(data) {
-  return request({
-    url: '/spiderData/attachments',
-    method: 'put',
-    data: data
-  })
-}
-
-// 删除附件
-export function delAttachments(id) {
-  return request({
-    url: '/spiderData/attachments/' + id,
-    method: 'delete'
-  })
-}

+ 0 - 44
xzl-ui/src/api/spiderData/structured.js

@@ -1,44 +0,0 @@
-import request from '@/utils/request'
-
-// 查询结构化数据列表
-export function listStructured(query) {
-  return request({
-    url: '/spiderData/structured/list',
-    method: 'get',
-    params: query
-  })
-}
-
-// 查询结构化数据详细
-export function getStructured(id) {
-  return request({
-    url: '/spiderData/structured/' + id,
-    method: 'get'
-  })
-}
-
-// 新增结构化数据
-export function addStructured(data) {
-  return request({
-    url: '/spiderData/structured',
-    method: 'post',
-    data: data
-  })
-}
-
-// 修改结构化数据
-export function updateStructured(data) {
-  return request({
-    url: '/spiderData/structured',
-    method: 'put',
-    data: data
-  })
-}
-
-// 删除结构化数据
-export function delStructured(id) {
-  return request({
-    url: '/spiderData/structured/' + id,
-    method: 'delete'
-  })
-}

+ 0 - 313
xzl-ui/src/views/spiderData/attachments/index.vue

@@ -1,313 +0,0 @@
-<template>
-  <div class="app-container">
-    <el-form :model="queryParams" ref="queryForm" size="small" :inline="true" v-show="showSearch" label-width="68px">
-      <el-form-item label="附件文件名" prop="fileName">
-        <el-input
-          v-model="queryParams.fileName"
-          placeholder="请输入附件文件名"
-          clearable
-          @keyup.enter.native="handleQuery"
-        />
-      </el-form-item>
-      <el-form-item label="OA系统原始URL" prop="originalUrl">
-        <el-input
-          v-model="queryParams.originalUrl"
-          placeholder="请输入OA系统原始URL"
-          clearable
-          @keyup.enter.native="handleQuery"
-        />
-      </el-form-item>
-      <el-form-item label="关联采集任务ID" prop="taskId">
-        <el-input
-          v-model="queryParams.taskId"
-          placeholder="请输入关联采集任务ID"
-          clearable
-          @keyup.enter.native="handleQuery"
-        />
-      </el-form-item>
-      <el-form-item>
-        <el-button type="primary" icon="el-icon-search" size="mini" @click="handleQuery">搜索</el-button>
-        <el-button icon="el-icon-refresh" size="mini" @click="resetQuery">重置</el-button>
-      </el-form-item>
-    </el-form>
-
-    <el-row :gutter="10" class="mb8">
-      <el-col :span="1.5">
-        <el-button
-          type="primary"
-          plain
-          icon="el-icon-plus"
-          size="mini"
-          @click="handleAdd"
-          v-hasPermi="['spiderData:attachments:add']"
-        >新增</el-button>
-      </el-col>
-      <el-col :span="1.5">
-        <el-button
-          type="success"
-          plain
-          icon="el-icon-edit"
-          size="mini"
-          :disabled="single"
-          @click="handleUpdate"
-          v-hasPermi="['spiderData:attachments:edit']"
-        >修改</el-button>
-      </el-col>
-      <el-col :span="1.5">
-        <el-button
-          type="danger"
-          plain
-          icon="el-icon-delete"
-          size="mini"
-          :disabled="multiple"
-          @click="handleDelete"
-          v-hasPermi="['spiderData:attachments:remove']"
-        >删除</el-button>
-      </el-col>
-      <el-col :span="1.5">
-        <el-button
-          type="warning"
-          plain
-          icon="el-icon-download"
-          size="mini"
-          @click="handleExport"
-          v-hasPermi="['spiderData:attachments:export']"
-        >导出</el-button>
-      </el-col>
-      <right-toolbar :showSearch.sync="showSearch" @queryTable="getList"></right-toolbar>
-    </el-row>
-
-    <el-table v-loading="loading" :data="attachmentsList" @selection-change="handleSelectionChange">
-      <el-table-column type="selection" width="55" align="center" />
-      <el-table-column label="附件ID" align="center" prop="id" />
-      <el-table-column label="关联原始数据ID" align="center" prop="rawDataId" />
-      <el-table-column label="关联结构化数据ID" align="center" prop="structuredDataId" />
-      <el-table-column label="附件文件名" align="center" prop="fileName" />
-      <el-table-column label="文件类型" align="center" prop="fileType" />
-      <el-table-column label="文件大小" align="center" prop="fileSize" />
-      <el-table-column label="OA系统原始URL" align="center" prop="originalUrl" />
-      <el-table-column label="本地存储路径" align="center" prop="localPath" />
-      <el-table-column label="下载时间" align="center" prop="downloadTime" width="180">
-        <template slot-scope="scope">
-          <span>{{ parseTime(scope.row.downloadTime, '{y}-{m}-{d}') }}</span>
-        </template>
-      </el-table-column>
-      <el-table-column label="关联采集任务ID" align="center" prop="taskId" />
-      <el-table-column label="操作" align="center" class-name="small-padding fixed-width">
-        <template slot-scope="scope">
-          <el-button
-            size="mini"
-            type="text"
-            icon="el-icon-edit"
-            @click="handleUpdate(scope.row)"
-            v-hasPermi="['spiderData:attachments:edit']"
-          >修改</el-button>
-          <el-button
-            size="mini"
-            type="text"
-            icon="el-icon-delete"
-            @click="handleDelete(scope.row)"
-            v-hasPermi="['spiderData:attachments:remove']"
-          >删除</el-button>
-        </template>
-      </el-table-column>
-    </el-table>
-    
-    <pagination
-      v-show="total>0"
-      :total="total"
-      :page.sync="queryParams.pageNum"
-      :limit.sync="queryParams.pageSize"
-      @pagination="getList"
-    />
-
-    <!-- 添加或修改附件对话框 -->
-    <el-dialog :title="title" :visible.sync="open" width="500px" append-to-body>
-      <el-form ref="form" :model="form" :rules="rules" label-width="80px">
-        <el-form-item label="关联原始数据ID" prop="rawDataId">
-          <el-input v-model="form.rawDataId" placeholder="请输入关联原始数据ID" />
-        </el-form-item>
-        <el-form-item label="关联结构化数据ID" prop="structuredDataId">
-          <el-input v-model="form.structuredDataId" placeholder="请输入关联结构化数据ID" />
-        </el-form-item>
-        <el-form-item label="附件文件名" prop="fileName">
-          <el-input v-model="form.fileName" placeholder="请输入附件文件名" />
-        </el-form-item>
-        <el-form-item label="文件大小" prop="fileSize">
-          <el-input v-model="form.fileSize" placeholder="请输入文件大小" />
-        </el-form-item>
-        <el-form-item label="OA系统原始URL" prop="originalUrl">
-          <el-input v-model="form.originalUrl" placeholder="请输入OA系统原始URL" />
-        </el-form-item>
-        <el-form-item label="本地存储路径" prop="localPath">
-          <el-input v-model="form.localPath" placeholder="请输入本地存储路径" />
-        </el-form-item>
-        <el-form-item label="下载时间" prop="downloadTime">
-          <el-date-picker clearable
-            v-model="form.downloadTime"
-            type="date"
-            value-format="yyyy-MM-dd"
-            placeholder="请选择下载时间">
-          </el-date-picker>
-        </el-form-item>
-        <el-form-item label="关联采集任务ID" prop="taskId">
-          <el-input v-model="form.taskId" placeholder="请输入关联采集任务ID" />
-        </el-form-item>
-      </el-form>
-      <div slot="footer" class="dialog-footer">
-        <el-button type="primary" @click="submitForm">确 定</el-button>
-        <el-button @click="cancel">取 消</el-button>
-      </div>
-    </el-dialog>
-  </div>
-</template>
-
-<script>
-import { listAttachments, getAttachments, delAttachments, addAttachments, updateAttachments } from "@/api/spiderData/attachments";
-
-export default {
-  name: "Attachments",
-  data() {
-    return {
-      // 遮罩层
-      loading: true,
-      // 选中数组
-      ids: [],
-      // 非单个禁用
-      single: true,
-      // 非多个禁用
-      multiple: true,
-      // 显示搜索条件
-      showSearch: true,
-      // 总条数
-      total: 0,
-      // 附件表格数据
-      attachmentsList: [],
-      // 弹出层标题
-      title: "",
-      // 是否显示弹出层
-      open: false,
-      // 查询参数
-      queryParams: {
-        pageNum: 1,
-        pageSize: 10,
-        fileName: null,
-        fileType: null,
-        originalUrl: null,
-        taskId: null
-      },
-      // 表单参数
-      form: {},
-      // 表单校验
-      rules: {
-        rawDataId: [
-          { required: true, message: "关联原始数据ID不能为空", trigger: "blur" }
-        ],
-      }
-    };
-  },
-  created() {
-    this.getList();
-  },
-  methods: {
-    /** 查询附件列表 */
-    getList() {
-      this.loading = true;
-      listAttachments(this.queryParams).then(response => {
-        this.attachmentsList = response.rows;
-        this.total = response.total;
-        this.loading = false;
-      });
-    },
-    // 取消按钮
-    cancel() {
-      this.open = false;
-      this.reset();
-    },
-    // 表单重置
-    reset() {
-      this.form = {
-        id: null,
-        rawDataId: null,
-        structuredDataId: null,
-        fileName: null,
-        fileType: null,
-        fileSize: null,
-        originalUrl: null,
-        localPath: null,
-        downloadTime: null,
-        taskId: null
-      };
-      this.resetForm("form");
-    },
-    /** 搜索按钮操作 */
-    handleQuery() {
-      this.queryParams.pageNum = 1;
-      this.getList();
-    },
-    /** 重置按钮操作 */
-    resetQuery() {
-      this.resetForm("queryForm");
-      this.handleQuery();
-    },
-    // 多选框选中数据
-    handleSelectionChange(selection) {
-      this.ids = selection.map(item => item.id)
-      this.single = selection.length!==1
-      this.multiple = !selection.length
-    },
-    /** 新增按钮操作 */
-    handleAdd() {
-      this.reset();
-      this.open = true;
-      this.title = "添加附件";
-    },
-    /** 修改按钮操作 */
-    handleUpdate(row) {
-      this.reset();
-      const id = row.id || this.ids
-      getAttachments(id).then(response => {
-        this.form = response.data;
-        this.open = true;
-        this.title = "修改附件";
-      });
-    },
-    /** 提交按钮 */
-    submitForm() {
-      this.$refs["form"].validate(valid => {
-        if (valid) {
-          if (this.form.id != null) {
-            updateAttachments(this.form).then(response => {
-              this.$modal.msgSuccess("修改成功");
-              this.open = false;
-              this.getList();
-            });
-          } else {
-            addAttachments(this.form).then(response => {
-              this.$modal.msgSuccess("新增成功");
-              this.open = false;
-              this.getList();
-            });
-          }
-        }
-      });
-    },
-    /** 删除按钮操作 */
-    handleDelete(row) {
-      const ids = row.id || this.ids;
-      this.$modal.confirm('是否确认删除附件编号为"' + ids + '"的数据项?').then(function() {
-        return delAttachments(ids);
-      }).then(() => {
-        this.getList();
-        this.$modal.msgSuccess("删除成功");
-      }).catch(() => {});
-    },
-    /** 导出按钮操作 */
-    handleExport() {
-      this.download('spiderData/attachments/export', {
-        ...this.queryParams
-      }, `attachments_${new Date().getTime()}.xlsx`)
-    }
-  }
-};
-</script>

+ 297 - 99
xzl-ui/src/views/spiderData/sourceData/index.vue

@@ -1,6 +1,14 @@
 <template>
   <div class="app-container">
-    <el-form :model="queryParams" ref="queryForm" size="small" :inline="true" v-show="showSearch" label-width="68px">
+    <!-- 搜索表单 -->
+    <el-form
+      :model="queryParams"
+      ref="queryForm"
+      size="small"
+      :inline="true"
+      v-show="showSearch"
+      label-width="68px"
+    >
       <el-form-item label="采集页面URL" prop="pageUrl">
         <el-input
           v-model="queryParams.pageUrl"
@@ -35,6 +43,7 @@
       </el-form-item>
     </el-form>
 
+    <!-- 操作按钮区 -->
     <el-row :gutter="10" class="mb8">
       <el-col :span="1.5">
         <el-button
@@ -43,8 +52,9 @@
           icon="el-icon-plus"
           size="mini"
           @click="handleAdd"
-          v-hasPermi="['spiderData:spiderData:add']"
-        >新增</el-button>
+          v-hasPermi="['spiderData:sourceData:add']"
+        >新增
+        </el-button>
       </el-col>
       <el-col :span="1.5">
         <el-button
@@ -52,10 +62,11 @@
           plain
           icon="el-icon-edit"
           size="mini"
-          :disabled="single"
+          :disabled="!hasSingleSelection"
           @click="handleUpdate"
-          v-hasPermi="['spiderData:spiderData:edit']"
-        >修改</el-button>
+          v-hasPermi="['spiderData:sourceData:edit']"
+        >修改
+        </el-button>
       </el-col>
       <el-col :span="1.5">
         <el-button
@@ -63,10 +74,11 @@
           plain
           icon="el-icon-delete"
           size="mini"
-          :disabled="multiple"
+          :disabled="!hasMultipleSelection"
           @click="handleDelete"
-          v-hasPermi="['spiderData:spiderData:remove']"
-        >删除</el-button>
+          v-hasPermi="['spiderData:sourceData:remove']"
+        >删除
+        </el-button>
       </el-col>
       <el-col :span="1.5">
         <el-button
@@ -75,60 +87,108 @@
           icon="el-icon-download"
           size="mini"
           @click="handleExport"
-          v-hasPermi="['spiderData:spiderData:export']"
-        >导出</el-button>
+          v-hasPermi="['spiderData:sourceData:export']"
+        >导出
+        </el-button>
       </el-col>
       <right-toolbar :showSearch.sync="showSearch" @queryTable="getList"></right-toolbar>
     </el-row>
 
-    <el-table v-loading="loading" :data="spiderDataList" @selection-change="handleSelectionChange">
-      <el-table-column type="selection" width="55" align="center" />
-      <!-- <el-table-column label="原始数据ID" align="center" prop="id" /> -->
-      <el-table-column label="采集页面URL" align="center" prop="pageUrl" />
-<!--      <el-table-column label="原始文本内容" align="center" prop="rawContent" width="500" show-overflow-tooltip/>-->
-       <el-table-column label="原始文本内容" align="center" prop="rawContent" width="300">
-        <template slot-scope="scope">
+    <!-- 数据表格 -->
+    <el-table
+      v-loading="loading"
+      :data="spiderDataList"
+      @selection-change="handleSelectionChange"
+      row-key="id"
+    >
+      <el-table-column type="selection" width="55" align="center"/>
+
+      <el-table-column label="采集页面URL" align="center" prop="pageUrl" width="300">
+        <template #default="scope">
+          <el-link
+            type="primary"
+            :href="scope.row.pageUrl"
+            target="_blank"
+            :underline="true"
+            class="text-ellipsis"
+          >
+            {{ formatLongText(scope.row.pageUrl, 30) }}
+          </el-link>
+        </template>
+      </el-table-column>
+
+      <el-table-column label="原始文本内容" align="center" prop="rawContent" width="200">
+        <template #default="scope">
+          <el-link
+            type="primary"
+            @click="openRichTextNewPage(scope.row.rawContent, '原始文本内容')"
+            :underline="false"
+            class="text-ellipsis"
+          >
+            {{ formatLongText(scope.row.rawContent, 50) }}
+          </el-link>
+        </template>
+      </el-table-column>
+
+      <el-table-column label="优化文本内容" align="center" prop="rawContentFilter" width="200">
+        <template #default="scope">
           <el-link
             type="primary"
-            @click="openRichTextPage(scope.row.rawContent)"
+            @click="openRichTextNewPage(scope.row.rawContentFilter, '优化文本内容')"
             :underline="false"
             class="text-ellipsis"
           >
-            {{ scope.row.rawContent.length > 50 ? scope.row.rawContent.substring(0, 50) + '...' : (scope.row.rawContent || '无') }}
+            {{ formatLongText(scope.row.rawContentFilter, 50) }}
           </el-link>
         </template>
-      </el-table-column> 
-      
+      </el-table-column>
+
+      <el-table-column label="原始附件信息" align="center" prop="rawAttachments" width="200">
+        <template #default="scope">
+          <el-link
+            type="primary"
+            @click="openRichTextNewPage(scope.row.rawAttachments, '原始附件信息')"
+            :underline="false"
+            class="text-ellipsis"
+          >
+            {{ formatLongText(scope.row.rawAttachments, 50) }}
+          </el-link>
+        </template>
+      </el-table-column>
 
-      <el-table-column label="原始附件信息" align="center" prop="rawAttachments" width="300" show-overflow-tooltip/>
       <el-table-column label="采集时间" align="center" prop="collectionTime" width="180">
-        <template slot-scope="scope">
+        <template #default="scope">
           <span>{{ parseTime(scope.row.collectionTime, '{y}-{m}-{d}') }}</span>
         </template>
       </el-table-column>
-      <el-table-column label="关联采集任务ID" align="center" prop="taskId" />
+
+      <el-table-column label="关联采集任务ID" align="center" prop="taskId"/>
+
       <el-table-column label="操作" align="center" class-name="small-padding fixed-width">
-        <template slot-scope="scope">
+        <template #default="scope">
           <el-button
             size="mini"
             type="text"
             icon="el-icon-edit"
             @click="handleUpdate(scope.row)"
-            v-hasPermi="['spiderData:spiderData:edit']"
-          >修改</el-button>
+            v-hasPermi="['spiderData:sourceData:edit']"
+          >修改
+          </el-button>
           <el-button
             size="mini"
             type="text"
             icon="el-icon-delete"
             @click="handleDelete(scope.row)"
-            v-hasPermi="['spiderData:spiderData:remove']"
-          >删除</el-button>
+            v-hasPermi="['spiderData:sourceData:remove']"
+          >删除
+          </el-button>
         </template>
       </el-table-column>
     </el-table>
 
+    <!-- 分页组件 -->
     <pagination
-      v-show="total>0"
+      v-show="total > 0"
       :total="total"
       :page.sync="queryParams.pageNum"
       :limit.sync="queryParams.pageSize"
@@ -139,13 +199,16 @@
     <el-dialog :title="title" :visible.sync="open" width="1000px" append-to-body>
       <el-form ref="form" :model="form" :rules="rules" label-width="80px">
         <el-form-item label="采集页面URL" prop="pageUrl">
-          <el-input v-model="form.pageUrl" placeholder="请输入采集页面URL" />
+          <el-input v-model="form.pageUrl" placeholder="请输入采集页面URL"/>
         </el-form-item>
         <el-form-item label="原始文本内容">
           <editor v-model="form.rawContent" :min-height="192"/>
         </el-form-item>
+        <el-form-item label="优化文本内容">
+          <editor v-model="form.rawContentFilter" :min-height="192"/>
+        </el-form-item>
         <el-form-item label="原始附件信息" prop="rawAttachments">
-          <el-input v-model="form.rawAttachments" type="textarea" placeholder="请输入内容" />
+          <el-input v-model="form.rawAttachments" type="textarea" placeholder="请输入内容"/>
         </el-form-item>
       </el-form>
       <div slot="footer" class="dialog-footer">
@@ -156,8 +219,20 @@
   </div>
 </template>
 
+
 <script>
-import { listSpiderData, getSpiderData, delSpiderData, addSpiderData, updateSpiderData,spiderCollect  } from "@/api/spiderData/spiderData";
+import {
+  listSpiderData,
+  getSpiderData,
+  delSpiderData,
+  addSpiderData,
+  updateSpiderData,
+  spiderCollect
+} from "@/api/spiderData/spiderData";
+
+// 常量定义
+const DEFAULT_PAGE_SIZE = 10;
+const DEFAULT_PAGE_NUM = 1;
 
 export default {
   name: "SpiderData",
@@ -167,10 +242,6 @@ export default {
       loading: true,
       // 选中数组
       ids: [],
-      // 非单个禁用
-      single: true,
-      // 非多个禁用
-      multiple: true,
       // 显示搜索条件
       showSearch: true,
       // 总条数
@@ -181,14 +252,15 @@ export default {
       title: "",
       // 是否显示弹出层
       open: false,
-      // 原始附件信息时间范围
+      // 采集时间范围
       daterangeCollectionTime: [],
       // 查询参数
       queryParams: {
-        pageNum: 1,
-        pageSize: 10,
+        pageNum: DEFAULT_PAGE_NUM,
+        pageSize: DEFAULT_PAGE_SIZE,
         pageUrl: null,
         rawContent: null,
+        rawContentFilter: null,
         rawAttachments: null,
         collectionTime: null,
         taskId: null
@@ -196,36 +268,112 @@ export default {
       // 表单参数
       form: {},
       // 表单校验
-      rules: {
-      }
+      rules: {},
+      // 新增附件弹窗相关数据
+      attachmentsDialogVisible: false,
+      attachments: []
     };
   },
+  computed: {
+    // 计算属性:是否有单个选中项
+    hasSingleSelection() {
+      return this.ids.length === 1;
+    },
+    // 计算属性:是否有选中项(用于删除)
+    hasMultipleSelection() {
+      return this.ids.length > 0;
+    }
+  },
   created() {
     this.getList();
   },
   methods: {
-    /** 打开富文本新页面 */
-    openRichTextPage(rawContent) {
-      // 路由跳转,携带原始文本参数(通过query传递)
-      this.$router.push({
-        path: '/spiderData/richTextView', // 新页面路由路径
-        query: { content: rawContent || '' } // 传递原始文本
-      });
+    /** 格式化长文本,超过指定长度显示省略号 */
+    formatLongText(text = '', maxLength) {
+      if (!text) return '无';
+      return text.length > maxLength
+        ? `${text.substring(0, maxLength)}...`
+        : text;
     },
+
+    /** 打开富文本新页面(新窗口) */
+    openRichTextNewPage(content, title = '文本内容') {
+      // 创建一个临时HTML内容
+      const htmlContent = `
+        <!DOCTYPE html>
+        <html lang="zh-CN">
+        <head>
+          <meta charset="UTF-8">
+          <meta name="viewport" content="width=device-width, initial-scale=1.0">
+          <title>${title}</title>
+          <style>
+            body {
+              font-family: Arial, sans-serif;
+              padding: 20px;
+              line-height: 1.6;
+            }
+            .container {
+              max-width: 1200px;
+              margin: 0 auto;
+            }
+            .title {
+              color: #333;
+              border-bottom: 1px solid #eee;
+              padding-bottom: 10px;
+              margin-bottom: 20px;
+            }
+          </style>
+        </head>
+        <body>
+          <div class="container">
+            <h2 class="title">${title}</h2>
+            <div class="content">${content || '无内容'}</div>
+          </div>
+        </body>
+        </html>
+      `;
+
+      // 创建Blob对象
+      const blob = new Blob([htmlContent], { type: 'text/html' });
+      // 创建临时URL
+      const url = URL.createObjectURL(blob);
+      // 打开新窗口
+      window.open(url, '_blank');
+
+      // 释放URL对象
+      setTimeout(() => {
+        URL.revokeObjectURL(url);
+      }, 1000);
+    },
+
     /** 查询原始采集数据列表 */
     getList() {
       this.loading = true;
-      this.queryParams.params = {};
-      if (null != this.daterangeCollectionTime && '' != this.daterangeCollectionTime) {
-        this.queryParams.params["beginCollectionTime"] = this.daterangeCollectionTime[0];
-        this.queryParams.params["endCollectionTime"] = this.daterangeCollectionTime[1];
+      // Work on a shallow copy of the query parameters; this.queryParams itself is left untouched
+      const params = { ...this.queryParams };
+
+      // Attach the collection-time range to the copy only when both dates are selected
+      if (this.daterangeCollectionTime && this.daterangeCollectionTime.length === 2) {
+        params.params = {
+          beginCollectionTime: this.daterangeCollectionTime[0],
+          endCollectionTime: this.daterangeCollectionTime[1]
+        };
       }
-      listSpiderData(this.queryParams).then(response => {
-        this.spiderDataList = response.rows;
-        this.total = response.total;
-        this.loading = false;
-      });
+
+      listSpiderData(params)
+        .then(response => {
+          this.spiderDataList = response.rows || [];
+          this.total = response.total || 0;
+        })
+        .catch(error => {
+          console.error('获取数据列表失败:', error);
+          this.$modal.msgError('获取数据失败,请重试');
+        })
+        .finally(() => {
+          this.loading = false;
+        });
     },
+
     /** 采集按钮操作 */
     spiderQuery() {
       // 校验:若pageUrl为空,给出提示
@@ -234,103 +382,153 @@ export default {
         return;
       }
 
-      // 调用封装好的采集接口
-      spiderCollect({
-        pageUrl: this.queryParams.pageUrl // 传递采集页面URL
-      }).then(response => {
-        this.$modal.msgSuccess("采集请求已发送,处理中...");
-        // 若需要刷新列表,可调用getList()
-        this.getList();
-      }).catch(error => {
-        this.$modal.msgError("采集失败:" + (error.response?.data?.msg || "服务器异常"));
-      });
+      // 显示加载状态
+      this.loading = true;
+
+      // 调用采集接口
+      spiderCollect({ pageUrl: this.queryParams.pageUrl })
+        .then(() => {
+          this.$modal.msgSuccess("采集请求已发送,处理中...");
+          this.getList(); // 刷新列表
+        })
+        .catch(error => {
+          this.$modal.msgError(`采集失败:${error.response?.data?.msg || "服务器异常"}`);
+        })
+        .finally(() => {
+          this.loading = false;
+        });
     },
+
     // 取消按钮
     cancel() {
       this.open = false;
       this.reset();
     },
+
     // 表单重置
     reset() {
       this.form = {
         id: null,
         pageUrl: null,
         rawContent: null,
+        rawContentFilter: null,
         rawAttachments: null,
         collectionTime: null,
         taskId: null
       };
       this.resetForm("form");
     },
+
     /** 搜索按钮操作 */
     handleQuery() {
-      this.queryParams.pageNum = 1;
+      this.queryParams.pageNum = DEFAULT_PAGE_NUM;
       this.getList();
     },
+
     /** 重置按钮操作 */
     resetQuery() {
       this.daterangeCollectionTime = [];
       this.resetForm("queryForm");
       this.handleQuery();
     },
+
     // 多选框选中数据
     handleSelectionChange(selection) {
-      this.ids = selection.map(item => item.id)
-      this.single = selection.length!==1
-      this.multiple = !selection.length
+      this.ids = selection.map(item => item.id);
     },
+
     /** 新增按钮操作 */
     handleAdd() {
       this.reset();
       this.open = true;
       this.title = "添加原始采集数据";
     },
+
     /** 修改按钮操作 */
     handleUpdate(row) {
       this.reset();
-      const id = row.id || this.ids
-      getSpiderData(id).then(response => {
-        this.form = response.data;
-        this.open = true;
-        this.title = "修改原始采集数据";
-      });
+      const id = row?.id || this.ids[0];
+      if (!id) return;
+
+      getSpiderData(id)
+        .then(response => {
+          this.form = response.data;
+          this.open = true;
+          this.title = "修改原始采集数据";
+        })
+        .catch(error => {
+          console.error('获取数据详情失败:', error);
+          this.$modal.msgError('获取数据详情失败,请重试');
+        });
     },
+
     /** 提交按钮 */
     submitForm() {
       this.$refs["form"].validate(valid => {
         if (valid) {
-          if (this.form.id != null) {
-            updateSpiderData(this.form).then(response => {
-              this.$modal.msgSuccess("修改成功");
-              this.open = false;
-              this.getList();
-            });
-          } else {
-            addSpiderData(this.form).then(response => {
-              this.$modal.msgSuccess("新增成功");
+          const isEdit = this.form.id != null;
+          const apiMethod = isEdit ? updateSpiderData : addSpiderData;
+
+          apiMethod(this.form)
+            .then(() => {
+              this.$modal.msgSuccess(isEdit ? "修改成功" : "新增成功");
               this.open = false;
               this.getList();
+            })
+            .catch(error => {
+              console.error(`${isEdit ? '修改' : '新增'}失败:`, error);
+              this.$modal.msgError(`${isEdit ? '修改' : '新增'}失败,请重试`);
             });
-          }
         }
       });
     },
+
     /** 删除按钮操作 */
     handleDelete(row) {
-      const ids = row.id || this.ids;
-      this.$modal.confirm('是否确认删除原始采集数据编号为"' + ids + '"的数据项?').then(function() {
-        return delSpiderData(ids);
-      }).then(() => {
-        this.getList();
-        this.$modal.msgSuccess("删除成功");
-      }).catch(() => {});
+      const ids = row?.id || this.ids;
+      if (!ids || (Array.isArray(ids) && ids.length === 0)) return;
+
+      this.$modal.confirm(`是否确认删除原始采集数据编号为"${ids}"的数据项?`)
+        .then(() => delSpiderData(ids))
+        .then(() => {
+          this.getList();
+          this.$modal.msgSuccess("删除成功");
+        })
+        .catch(() => {
+          // 取消删除时的处理
+        });
     },
+
     /** 导出按钮操作 */
     handleExport() {
-      this.download('spiderData/sourceData/export', {
-        ...this.queryParams
-      }, `spiderData_${new Date().getTime()}.xlsx`)
+      this.download(
+        'spiderData/sourceData/export',
+        { ...this.queryParams },
+        `spiderData_${new Date().getTime()}.xlsx`
+      );
     }
   }
 };
 </script>
+
+<style scoped>
+/* 附件列表样式,避免布局错乱 */
+.attachment-item {
+  padding: 12px 0;
+  border-bottom: 1px solid #f5f5f5;
+}
+.attachment-item:last-child {
+  border-bottom: none;
+}
+.text-ellipsis {
+  display: block;
+  white-space: nowrap;
+  overflow: hidden;
+  text-overflow: ellipsis;
+}
+/* 容器样式 */
+.app-container {
+  padding: 16px;
+  box-sizing: border-box;
+}
+</style>

+ 475 - 0
xzl-ui/src/views/spiderData/sourceData/index.vuebak

@@ -0,0 +1,475 @@
+<template>
+  <div class="app-container">
+    <el-form :model="queryParams" ref="queryForm" size="small" :inline="true" v-show="showSearch" label-width="68px">
+      <el-form-item label="采集页面URL" prop="pageUrl">
+        <el-input
+          v-model="queryParams.pageUrl"
+          placeholder="请输入采集页面URL"
+          clearable
+          @keyup.enter.native="handleQuery"
+        />
+      </el-form-item>
+      <el-form-item label="采集时间">
+        <el-date-picker
+          v-model="daterangeCollectionTime"
+          style="width: 240px"
+          value-format="yyyy-MM-dd"
+          type="daterange"
+          range-separator="-"
+          start-placeholder="开始日期"
+          end-placeholder="结束日期"
+        ></el-date-picker>
+      </el-form-item>
+      <el-form-item label="关联采集任务ID" prop="taskId">
+        <el-input
+          v-model="queryParams.taskId"
+          placeholder="请输入关联采集任务ID"
+          clearable
+          @keyup.enter.native="handleQuery"
+        />
+      </el-form-item>
+      <el-form-item>
+        <el-button type="primary" icon="el-icon-search" size="mini" @click="handleQuery">搜索</el-button>
+        <el-button icon="el-icon-refresh" size="mini" @click="resetQuery">重置</el-button>
+        <el-button icon="el-icon-document" size="mini" @click="spiderQuery">采集</el-button>
+      </el-form-item>
+    </el-form>
+
+    <el-row :gutter="10" class="mb8">
+      <el-col :span="1.5">
+        <el-button
+          type="primary"
+          plain
+          icon="el-icon-plus"
+          size="mini"
+          @click="handleAdd"
+          v-hasPermi="['spiderData:sourceData:add']"
+        >新增
+        </el-button>
+      </el-col>
+      <el-col :span="1.5">
+        <el-button
+          type="success"
+          plain
+          icon="el-icon-edit"
+          size="mini"
+          :disabled="single"
+          @click="handleUpdate"
+          v-hasPermi="['spiderData:sourceData:edit']"
+        >修改
+        </el-button>
+      </el-col>
+      <el-col :span="1.5">
+        <el-button
+          type="danger"
+          plain
+          icon="el-icon-delete"
+          size="mini"
+          :disabled="multiple"
+          @click="handleDelete"
+          v-hasPermi="['spiderData:sourceData:remove']"
+        >删除
+        </el-button>
+      </el-col>
+      <el-col :span="1.5">
+        <el-button
+          type="warning"
+          plain
+          icon="el-icon-download"
+          size="mini"
+          @click="handleExport"
+          v-hasPermi="['spiderData:sourceData:export']"
+        >导出
+        </el-button>
+      </el-col>
+      <right-toolbar :showSearch.sync="showSearch" @queryTable="getList"></right-toolbar>
+    </el-row>
+
+    <el-table v-loading="loading" :data="spiderDataList" @selection-change="handleSelectionChange">
+      <el-table-column type="selection" width="55" align="center"/>
+      <!-- <el-table-column label="原始数据ID" align="center" prop="id" /> -->
+      <!-- <el-table-column label="采集页面URL" align="center" prop="pageUrl" /> -->
+      <!-- 修改为以下代码 -->
+      <el-table-column label="采集页面URL" align="center" prop="pageUrl" width="300">
+        <template slot-scope="scope">
+          <el-link
+            type="primary"
+            :href="scope.row.pageUrl"
+            target="_blank"
+            :underline="true"
+            class="text-ellipsis"
+          >
+            <!-- pageUrl may be null/empty: guard before reading .length so the '无' fallback is reachable -->
+            {{
+              (scope.row.pageUrl || '').length > 30 ? scope.row.pageUrl.substring(0, 30) + '...' : (scope.row.pageUrl || '无')
+            }}
+          </el-link>
+        </template>
+      </el-table-column>
+      <!--      <el-table-column label="原始文本内容" align="center" prop="rawContent" width="500" show-overflow-tooltip/>-->
+      <!-- 修改原始文本内容列的el-link部分 -->
+      <el-table-column label="原始文本内容" align="center" prop="rawContent" width="200">
+        <template slot-scope="scope">
+          <el-link
+            type="primary"
+            @click="openRichTextNewPage(scope.row.rawContent)"
+            :underline="false"
+            class="text-ellipsis"
+          >
+            <!-- rawContent may be null/empty: guard before reading .length so the '无' fallback is reachable -->
+            {{
+              (scope.row.rawContent || '').length > 50 ? scope.row.rawContent.substring(0, 50) + '...' : (scope.row.rawContent || '无')
+            }}
+          </el-link>
+        </template>
+      </el-table-column>
+      <!-- <el-table-column label="优化文本内容" align="center" prop="rawContentFilter" width="500" show-overflow-tooltip/> -->
+      <el-table-column label="优化文本内容" align="center" prop="rawContentFilter" width="200">
+        <template slot-scope="scope">
+          <el-link
+            type="primary"
+            @click="openRichTextNewPage(scope.row.rawContentFilter)"
+            :underline="false"
+            class="text-ellipsis"
+          >
+            <!-- rawContentFilter may be null/empty: guard before reading .length so the '无' fallback is reachable -->
+            {{
+              (scope.row.rawContentFilter || '').length > 50 ? scope.row.rawContentFilter.substring(0, 50) + '...' : (scope.row.rawContentFilter || '无')
+            }}
+          </el-link>
+        </template>
+      </el-table-column>
+
+
+      <!-- <el-table-column label="原始附件信息" align="center" prop="rawAttachments" width="300" show-overflow-tooltip/> -->
+      <el-table-column label="原始附件信息" align="center" prop="rawAttachments" width="200">
+        <template slot-scope="scope">
+          <el-link
+            type="primary"
+            @click="openRichTextNewPage(scope.row.rawAttachments)"
+            :underline="false"
+            class="text-ellipsis"
+          >
+            <!-- rawAttachments may be null/empty: guard before reading .length so the '无' fallback is reachable -->
+            {{
+              (scope.row.rawAttachments || '').length > 50 ? scope.row.rawAttachments.substring(0, 50) + '...' : (scope.row.rawAttachments || '无')
+            }}
+          </el-link>
+        </template>
+      </el-table-column>
+
+
+      <el-table-column label="采集时间" align="center" prop="collectionTime" width="180">
+        <template slot-scope="scope">
+          <span>{{ parseTime(scope.row.collectionTime, '{y}-{m}-{d}') }}</span>
+        </template>
+      </el-table-column>
+      <el-table-column label="关联采集任务ID" align="center" prop="taskId"/>
+      <el-table-column label="操作" align="center" class-name="small-padding fixed-width">
+        <template slot-scope="scope">
+          <el-button
+            size="mini"
+            type="text"
+            icon="el-icon-edit"
+            @click="handleUpdate(scope.row)"
+            v-hasPermi="['spiderData:sourceData:edit']"
+          >修改
+          </el-button>
+          <el-button
+            size="mini"
+            type="text"
+            icon="el-icon-delete"
+            @click="handleDelete(scope.row)"
+            v-hasPermi="['spiderData:sourceData:remove']"
+          >删除
+          </el-button>
+        </template>
+      </el-table-column>
+    </el-table>
+
+    <pagination
+      v-show="total>0"
+      :total="total"
+      :page.sync="queryParams.pageNum"
+      :limit.sync="queryParams.pageSize"
+      @pagination="getList"
+    />
+
+    <!-- 添加或修改原始采集数据对话框 -->
+    <el-dialog :title="title" :visible.sync="open" width="1000px" append-to-body>
+      <el-form ref="form" :model="form" :rules="rules" label-width="80px">
+        <el-form-item label="采集页面URL" prop="pageUrl">
+          <el-input v-model="form.pageUrl" placeholder="请输入采集页面URL"/>
+        </el-form-item>
+        <el-form-item label="原始文本内容">
+          <editor v-model="form.rawContent" :min-height="192"/>
+        </el-form-item>
+        <el-form-item label="优化文本内容">
+          <editor v-model="form.rawContentFilter" :min-height="192"/>
+        </el-form-item>
+        <el-form-item label="原始附件信息" prop="rawAttachments">
+          <el-input v-model="form.rawAttachments" type="textarea" placeholder="请输入内容"/>
+        </el-form-item>
+      </el-form>
+      <div slot="footer" class="dialog-footer">
+        <el-button type="primary" @click="submitForm">确 定</el-button>
+        <el-button @click="cancel">取 消</el-button>
+      </div>
+    </el-dialog>
+  </div>
+</template>
+
+
+<script>
+import {
+  listSpiderData,
+  getSpiderData,
+  delSpiderData,
+  addSpiderData,
+  updateSpiderData,
+  spiderCollect
+} from "@/api/spiderData/spiderData";
+
+export default {
+  name: "SpiderData",
+  data() {
+    return {
+      // Loading mask shown on the table (bound to v-loading)
+      loading: true,
+      // Ids of the rows currently selected in the table
+      ids: [],
+      // Disables the toolbar "修改" button unless exactly one row is selected
+      single: true,
+      // Disables the toolbar "删除" button while no row is selected
+      multiple: true,
+      // Whether the search form is visible
+      showSearch: true,
+      // Total number of rows, used by the pagination component
+      total: 0,
+      // Rows displayed in the table
+      spiderDataList: [],
+      // Title of the add/edit dialog
+      title: "",
+      // Whether the add/edit dialog is visible
+      open: false,
+      // Collection-time date range (bound to the "采集时间" date picker)
+      daterangeCollectionTime: [],
+      // Query parameters sent to listSpiderData
+      queryParams: {
+        pageNum: 1,
+        pageSize: 10,
+        pageUrl: null,
+        rawContent: null,
+        rawContentFilter: null,
+        rawAttachments: null,
+        collectionTime: null,
+        taskId: null
+      },
+      // Model of the add/edit dialog form
+      form: {},
+      // Validation rules for the dialog form (none defined)
+      rules: {},
+      // Attachment dialog state - NOTE(review): appears unused in this component, confirm before removing
+      attachmentsDialogVisible: false,
+      attachments: []
+    };
+  },
+  created() {
+    this.getList();
+  },
+  methods: {
+
+    /** 打开富文本新页面(新窗口) */
+    openRichTextNewPage(rawContent) {
+      // 创建一个临时HTML内容
+      const htmlContent = `
+      <!DOCTYPE html>
+      <html lang="zh-CN">
+      <head>
+        <meta charset="UTF-8">
+        <meta name="viewport" content="width=device-width, initial-scale=1.0">
+        <title>原始文本内容</title>
+        <style>
+          body {
+            font-family: Arial, sans-serif;
+            padding: 20px;
+            line-height: 1.6;
+          }
+          .container {
+            max-width: 1200px;
+            margin: 0 auto;
+          }
+          .title {
+            color: #333;
+            border-bottom: 1px solid #eee;
+            padding-bottom: 10px;
+            margin-bottom: 20px;
+          }
+        </style>
+      </head>
+      <body>
+        <div class="container">
+          <h2 class="title">原始文本内容</h2>
+          <div class="content">${rawContent || '无内容'}</div>
+        </div>
+      </body>
+      </html>
+    `;
+
+      // 创建Blob对象
+      const blob = new Blob([htmlContent], {type: 'text/html'});
+      // 创建临时URL
+      const url = URL.createObjectURL(blob);
+      // 打开新窗口
+      window.open(url, '_blank');
+
+      // 释放URL对象(可选,浏览器会自动回收,但手动释放更规范)
+      setTimeout(() => {
+        URL.revokeObjectURL(url);
+      }, 1000);
+    },
+    /** 查询原始采集数据列表 */
+    getList() {
+      this.loading = true;
+      this.queryParams.params = {};
+      if (null != this.daterangeCollectionTime && '' != this.daterangeCollectionTime) {
+        this.queryParams.params["beginCollectionTime"] = this.daterangeCollectionTime[0];
+        this.queryParams.params["endCollectionTime"] = this.daterangeCollectionTime[1];
+      }
+      listSpiderData(this.queryParams).then(response => {
+        this.spiderDataList = response.rows;
+        this.total = response.total;
+        this.loading = false;
+      });
+    },
+    /** 采集按钮操作 */
+    spiderQuery() {
+      // 校验:若pageUrl为空,给出提示
+      if (!this.queryParams.pageUrl) {
+        this.$modal.msgWarning("请输入采集页面URL");
+        return;
+      }
+
+      // 调用封装好的采集接口
+      spiderCollect({
+        pageUrl: this.queryParams.pageUrl // 传递采集页面URL
+      }).then(response => {
+        this.$modal.msgSuccess("采集请求已发送,处理中...");
+        // 若需要刷新列表,可调用getList()
+        this.getList();
+      }).catch(error => {
+        this.$modal.msgError("采集失败:" + (error.response?.data?.msg || "服务器异常"));
+      });
+    },
+    // "取 消" dialog button: close the dialog and reset its form
+    cancel() {
+      this.open = false;
+      this.reset();
+    },
+    // Reset the dialog form model to all-null fields, then call resetForm("form")
+    reset() {
+      this.form = {
+        id: null,
+        pageUrl: null,
+        rawContent: null,
+        rawContentFilter: null,
+        rawAttachments: null,
+        collectionTime: null,
+        taskId: null
+      };
+      this.resetForm("form");
+    },
+    /** "搜索" button: go back to page 1 and reload the list. */
+    handleQuery() {
+      this.queryParams.pageNum = 1;
+      this.getList();
+    },
+    /** "重置" button: clear the date range, reset the query form, then search again. */
+    resetQuery() {
+      this.daterangeCollectionTime = [];
+      this.resetForm("queryForm");
+      this.handleQuery();
+    },
+    // Table selection changed: record the selected ids and refresh the single/multiple button flags
+    handleSelectionChange(selection) {
+      this.ids = selection.map(item => item.id)
+      this.single = selection.length !== 1
+      this.multiple = !selection.length
+    },
+    /** "新增" button: reset the form and open the dialog with the add title. */
+    handleAdd() {
+      this.reset();
+      this.open = true;
+      this.title = "添加原始采集数据";
+    },
+    /** 修改按钮操作 */
+    handleUpdate(row) {
+      this.reset();
+      const id = row.id || this.ids
+      getSpiderData(id).then(response => {
+        this.form = response.data;
+        this.open = true;
+        this.title = "修改原始采集数据";
+      });
+    },
+    /** 提交按钮 */
+    submitForm() {
+      this.$refs["form"].validate(valid => {
+        if (valid) {
+          if (this.form.id != null) {
+            updateSpiderData(this.form).then(response => {
+              this.$modal.msgSuccess("修改成功");
+              this.open = false;
+              this.getList();
+            });
+          } else {
+            addSpiderData(this.form).then(response => {
+              this.$modal.msgSuccess("新增成功");
+              this.open = false;
+              this.getList();
+            });
+          }
+        }
+      });
+    },
+    /** 删除按钮操作 */
+    handleDelete(row) {
+      const ids = row.id || this.ids;
+      this.$modal.confirm('是否确认删除原始采集数据编号为"' + ids + '"的数据项?').then(function () {
+        return delSpiderData(ids);
+      }).then(() => {
+        this.getList();
+        this.$modal.msgSuccess("删除成功");
+      }).catch(() => {
+      });
+    },
+    /** 导出按钮操作 */
+    handleExport() {
+      this.download('spiderData/sourceData/export', {
+        ...this.queryParams
+      }, `spiderData_${new Date().getTime()}.xlsx`)
+    }
+  }
+};
+
+</script>
+
+<!--
+<style scoped>
+/* 新增:附件列表样式,避免布局错乱 */
+.attachment-item {
+  padding: 12px 0;
+  border-bottom: 1px solid #f5f5f5;
+}
+.attachment-item:last-child {
+  border-bottom: none;
+}
+.text-ellipsis {
+  display: block;
+  white-space: nowrap;
+  overflow: hidden;
+  text-overflow: ellipsis;
+}
+/* 修复:表格滚动容器样式 */
+.app-container {
+  padding: 16px;
+  box-sizing: border-box;
+}
+</style>
+ -->

+ 0 - 357
xzl-ui/src/views/spiderData/structured/index.vue

@@ -1,357 +0,0 @@
-<template>
-  <div class="app-container">
-    <el-form :model="queryParams" ref="queryForm" size="small" :inline="true" v-show="showSearch" label-width="68px">
-      <el-form-item label="文档标题" prop="title">
-        <el-input
-          v-model="queryParams.title"
-          placeholder="请输入文档标题"
-          clearable
-          @keyup.enter.native="handleQuery"
-        />
-      </el-form-item>
-      <el-form-item label="作者" prop="author">
-        <el-input
-          v-model="queryParams.author"
-          placeholder="请输入作者"
-          clearable
-          @keyup.enter.native="handleQuery"
-        />
-      </el-form-item>
-      <el-form-item label="所属部门" prop="department">
-        <el-input
-          v-model="queryParams.department"
-          placeholder="请输入所属部门"
-          clearable
-          @keyup.enter.native="handleQuery"
-        />
-      </el-form-item>
-      <el-form-item label="发布时间">
-        <el-date-picker
-          v-model="daterangePublishTime"
-          style="width: 240px"
-          value-format="yyyy-MM-dd"
-          type="daterange"
-          range-separator="-"
-          start-placeholder="开始日期"
-          end-placeholder="结束日期"
-        ></el-date-picker>
-      </el-form-item>
-      <el-form-item label="关键词" prop="keywords">
-        <el-input
-          v-model="queryParams.keywords"
-          placeholder="请输入关键词"
-          clearable
-          @keyup.enter.native="handleQuery"
-        />
-      </el-form-item>
-      <el-form-item>
-        <el-button type="primary" icon="el-icon-search" size="mini" @click="handleQuery">搜索</el-button>
-        <el-button icon="el-icon-refresh" size="mini" @click="resetQuery">重置</el-button>
-      </el-form-item>
-    </el-form>
-
-    <el-row :gutter="10" class="mb8">
-      <el-col :span="1.5">
-        <el-button
-          type="primary"
-          plain
-          icon="el-icon-plus"
-          size="mini"
-          @click="handleAdd"
-          v-hasPermi="['spiderData:structured:add']"
-        >新增</el-button>
-      </el-col>
-      <el-col :span="1.5">
-        <el-button
-          type="success"
-          plain
-          icon="el-icon-edit"
-          size="mini"
-          :disabled="single"
-          @click="handleUpdate"
-          v-hasPermi="['spiderData:structured:edit']"
-        >修改</el-button>
-      </el-col>
-      <el-col :span="1.5">
-        <el-button
-          type="danger"
-          plain
-          icon="el-icon-delete"
-          size="mini"
-          :disabled="multiple"
-          @click="handleDelete"
-          v-hasPermi="['spiderData:structured:remove']"
-        >删除</el-button>
-      </el-col>
-      <el-col :span="1.5">
-        <el-button
-          type="warning"
-          plain
-          icon="el-icon-download"
-          size="mini"
-          @click="handleExport"
-          v-hasPermi="['spiderData:structured:export']"
-        >导出</el-button>
-      </el-col>
-      <right-toolbar :showSearch.sync="showSearch" @queryTable="getList"></right-toolbar>
-    </el-row>
-
-    <el-table v-loading="loading" :data="structuredList" @selection-change="handleSelectionChange">
-      <el-table-column type="selection" width="55" align="center" />
-      <el-table-column label="结构化数据ID" align="center" prop="id" />
-      <el-table-column label="关联原始数据ID" align="center" prop="rawDataId" />
-      <el-table-column label="文档标题" align="center" prop="title" />
-      <el-table-column label="结构化文本内容" align="center" prop="content" />
-      <el-table-column label="作者" align="center" prop="author" />
-      <el-table-column label="所属部门" align="center" prop="department" />
-      <el-table-column label="发布时间" align="center" prop="publishTime" width="180">
-        <template slot-scope="scope">
-          <span>{{ parseTime(scope.row.publishTime, '{y}-{m}-{d}') }}</span>
-        </template>
-      </el-table-column>
-      <el-table-column label="关键词" align="center" prop="keywords" />
-      <el-table-column label="自定义字段" align="center" prop="customFields" />
-      <el-table-column label="处理时间" align="center" prop="processTime" width="180">
-        <template slot-scope="scope">
-          <span>{{ parseTime(scope.row.processTime, '{y}-{m}-{d}') }}</span>
-        </template>
-      </el-table-column>
-      <el-table-column label="操作" align="center" class-name="small-padding fixed-width">
-        <template slot-scope="scope">
-          <el-button
-            size="mini"
-            type="text"
-            icon="el-icon-edit"
-            @click="handleUpdate(scope.row)"
-            v-hasPermi="['spiderData:structured:edit']"
-          >修改</el-button>
-          <el-button
-            size="mini"
-            type="text"
-            icon="el-icon-delete"
-            @click="handleDelete(scope.row)"
-            v-hasPermi="['spiderData:structured:remove']"
-          >删除</el-button>
-        </template>
-      </el-table-column>
-    </el-table>
-    
-    <pagination
-      v-show="total>0"
-      :total="total"
-      :page.sync="queryParams.pageNum"
-      :limit.sync="queryParams.pageSize"
-      @pagination="getList"
-    />
-
-    <!-- 添加或修改结构化数据对话框 -->
-    <el-dialog :title="title" :visible.sync="open" width="500px" append-to-body>
-      <el-form ref="form" :model="form" :rules="rules" label-width="80px">
-        <el-form-item label="关联原始数据ID" prop="rawDataId">
-          <el-input v-model="form.rawDataId" placeholder="请输入关联原始数据ID" />
-        </el-form-item>
-        <el-form-item label="文档标题" prop="title">
-          <el-input v-model="form.title" placeholder="请输入文档标题" />
-        </el-form-item>
-        <el-form-item label="结构化文本内容">
-          <editor v-model="form.content" :min-height="192"/>
-        </el-form-item>
-        <el-form-item label="作者" prop="author">
-          <el-input v-model="form.author" placeholder="请输入作者" />
-        </el-form-item>
-        <el-form-item label="所属部门" prop="department">
-          <el-input v-model="form.department" placeholder="请输入所属部门" />
-        </el-form-item>
-        <el-form-item label="发布时间" prop="publishTime">
-          <el-date-picker clearable
-            v-model="form.publishTime"
-            type="date"
-            value-format="yyyy-MM-dd"
-            placeholder="请选择发布时间">
-          </el-date-picker>
-        </el-form-item>
-        <el-form-item label="关键词" prop="keywords">
-          <el-input v-model="form.keywords" placeholder="请输入关键词" />
-        </el-form-item>
-        <el-form-item label="自定义字段" prop="customFields">
-          <el-input v-model="form.customFields" type="textarea" placeholder="请输入内容" />
-        </el-form-item>
-        <el-form-item label="处理时间" prop="processTime">
-          <el-date-picker clearable
-            v-model="form.processTime"
-            type="date"
-            value-format="yyyy-MM-dd"
-            placeholder="请选择处理时间">
-          </el-date-picker>
-        </el-form-item>
-      </el-form>
-      <div slot="footer" class="dialog-footer">
-        <el-button type="primary" @click="submitForm">确 定</el-button>
-        <el-button @click="cancel">取 消</el-button>
-      </div>
-    </el-dialog>
-  </div>
-</template>
-
-<script>
-import { listStructured, getStructured, delStructured, addStructured, updateStructured } from "@/api/spiderData/structured";
-
-export default {
-  name: "Structured",
-  // Initial component state: table/selection flags, search criteria, dialog state and form rules.
-  data() {
-    return {
-      // Loading mask shown over the table
-      loading: true,
-      // IDs of the currently selected rows
-      ids: [],
-      // True unless exactly one row is selected (bound to the toolbar "edit" button's disabled state)
-      single: true,
-      // True when no row is selected (bound to the toolbar "delete" button's disabled state)
-      multiple: true,
-      // Whether the search form is shown
-      showSearch: true,
-      // Total number of records
-      total: 0,
-      // Rows displayed in the structured-data table
-      structuredList: [],
-      // Dialog title
-      title: "",
-      // Whether the dialog is shown
-      open: false,
-      // Publish-time date range selected in the search form
-      daterangePublishTime: [],
-      // Query parameters
-      queryParams: {
-        pageNum: 1,
-        pageSize: 10,
-        title: null,
-        content: null,
-        author: null,
-        department: null,
-        publishTime: null,
-        keywords: null,
-      },
-      // Form model of the add/edit dialog
-      form: {},
-      // Form validation rules
-      rules: {
-        rawDataId: [
-          { required: true, message: "关联原始数据ID不能为空", trigger: "blur" }
-        ],
-        processTime: [
-          { required: true, message: "处理时间不能为空", trigger: "blur" }
-        ]
-      }
-    };
-  },
-  // Load the first page of data as soon as the component is created.
-  created() {
-    this.getList();
-  },
-  methods: {
-    /** 查询结构化数据列表 */
-    getList() {
-      this.loading = true;
-      this.queryParams.params = {};
-      if (null != this.daterangePublishTime && '' != this.daterangePublishTime) {
-        this.queryParams.params["beginPublishTime"] = this.daterangePublishTime[0];
-        this.queryParams.params["endPublishTime"] = this.daterangePublishTime[1];
-      }
-      listStructured(this.queryParams).then(response => {
-        this.structuredList = response.rows;
-        this.total = response.total;
-        this.loading = false;
-      });
-    },
-    // Handle the dialog's "cancel" button: hide the dialog, then reset the form via reset().
-    cancel() {
-      this.open = false;
-      this.reset();
-    },
-    // 表单重置
-    reset() {
-      this.form = {
-        id: null,
-        rawDataId: null,
-        title: null,
-        content: null,
-        author: null,
-        department: null,
-        publishTime: null,
-        keywords: null,
-        customFields: null,
-        processTime: null
-      };
-      this.resetForm("form");
-    },
-    /** Handle the "search" button: go back to page 1 and reload the list. */
-    handleQuery() {
-      this.queryParams.pageNum = 1;
-      this.getList();
-    },
-    /** Handle the "reset" button: clear the publish-time range, reset the "queryForm" form, then search again. */
-    resetQuery() {
-      this.daterangePublishTime = [];
-      this.resetForm("queryForm");
-      this.handleQuery();
-    },
-    // 多选框选中数据
-    handleSelectionChange(selection) {
-      this.ids = selection.map(item => item.id)
-      this.single = selection.length!==1
-      this.multiple = !selection.length
-    },
-    /** 新增按钮操作 */
-    handleAdd() {
-      this.reset();
-      this.open = true;
-      this.title = "添加结构化数据";
-    },
-    /** 修改按钮操作 */
-    handleUpdate(row) {
-      this.reset();
-      const id = row.id || this.ids
-      getStructured(id).then(response => {
-        this.form = response.data;
-        this.open = true;
-        this.title = "修改结构化数据";
-      });
-    },
-    /** 提交按钮 */
-    submitForm() {
-      this.$refs["form"].validate(valid => {
-        if (valid) {
-          if (this.form.id != null) {
-            updateStructured(this.form).then(response => {
-              this.$modal.msgSuccess("修改成功");
-              this.open = false;
-              this.getList();
-            });
-          } else {
-            addStructured(this.form).then(response => {
-              this.$modal.msgSuccess("新增成功");
-              this.open = false;
-              this.getList();
-            });
-          }
-        }
-      });
-    },
-    /** 删除按钮操作 */
-    handleDelete(row) {
-      const ids = row.id || this.ids;
-      this.$modal.confirm('是否确认删除结构化数据编号为"' + ids + '"的数据项?').then(function() {
-        return delStructured(ids);
-      }).then(() => {
-        this.getList();
-        this.$modal.msgSuccess("删除成功");
-      }).catch(() => {});
-    },
-    /** 导出按钮操作 */
-    handleExport() {
-      this.download('spiderData/structured/export', {
-        ...this.queryParams
-      }, `structured_${new Date().getTime()}.xlsx`)
-    }
-  }
-};
-</script>