去除前端标签

master
xiaoCJ 2 years ago
parent 30eee5ac36
commit be3ae0be81

@ -14,6 +14,7 @@ import com.ruoyi.biemo.mongodb.utils.MongoHelper;
import com.ruoyi.biemo.nlp.DependencyParserUtils;
import com.ruoyi.biemo.nlp.SentimentAnalysisUtils;
import com.ruoyi.biemo.nlp.SummaryUtils;
import com.ruoyi.biemo.utils.FormatUtil;
import com.ruoyi.biemo.utils.MyObjects;
import com.ruoyi.common.core.domain.AjaxResult;
import com.ruoyi.common.utils.StringUtils;
@ -29,10 +30,12 @@ import org.springframework.context.event.EventListener;
import org.springframework.stereotype.Service;
import javax.print.Doc;
import java.io.IOException;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
* @author makesoft
@ -450,36 +453,53 @@ public class DocInfoService extends EsService<DocInfo> {
public List<WordCloudItem> getWordCloudByCateId(String categoryId) {
String regex = "<.*?>"; // 匹配HTML标签的正则表达式
String regEx="[\n`~1234567890!@#$%^&*()+=|{}':;',\\[\\].<>/?~@#¥%……&*()——+|{}【】‘;:”“’。, 、?\n" +
"  ]";
Map<String, Integer> temp = new ConcurrentHashMap<>();
List<WordCloudItem> wordCloudItemList = new ArrayList<>();
DocInfo docInfo = new DocInfo();
docInfo.setCateId(categoryId);
List<DocInfo> docInfoList = selectDocInfoList(docInfo);
if (CollectionUtils.isNotEmpty(docInfoList)) {
List<Term> termList = docInfoList.parallelStream().filter(ObjectUtils::isNotEmpty).flatMap(_docInfo -> NLPTokenizer.segment(_docInfo.getContent().replaceAll(regex, "").replaceAll("\\s+", "").replaceAll("[,  '“”.。]", "").trim()).stream()).collect(Collectors.toList());
if (CollectionUtils.isNotEmpty(termList)) {
termList.parallelStream().forEach(term -> {
String word = term.word;
Integer value = term.getFrequency();
if (!temp.containsKey(word)) {
temp.put(word, 1);
} else {
temp.put(word, temp.get(word) + 1);
}
});
List<Term> termList = docInfoList.parallelStream().filter(ObjectUtils::isNotEmpty).flatMap(_docInfo -> {
try {
return NLPTokenizer.
segment(FormatUtil.RemovalOfStopWords(_docInfo.getContent().replaceAll(regex, "").replaceAll("\\s+", "").replaceAll(regEx, "").trim())
).stream();
} catch (IOException e) {
e.printStackTrace();
return Stream.empty();
}
}).collect(Collectors.toList());
if (CollectionUtils.isNotEmpty(termList)) {
termList.parallelStream().filter(t->t.word.length()>1).forEach(term -> {
String word = term.word;
Integer value = term.getFrequency();
if (!temp.containsKey(word)) {
temp.put(word, 1);
} else {
temp.put(word, temp.get(word) + 1);
}
});
}
for (Map.Entry<String, Integer> entry : temp.entrySet()) {
WordCloudItem wordCloudItem = new WordCloudItem();
wordCloudItem.setName(entry.getKey());
wordCloudItem.setValue(entry.getValue());
wordCloudItemList.add(wordCloudItem);
}
return wordCloudItemList;
for(
Map.Entry<String, Integer> entry :temp.entrySet())
{
WordCloudItem wordCloudItem = new WordCloudItem();
wordCloudItem.setName(entry.getKey());
wordCloudItem.setValue(entry.getValue());
wordCloudItemList.add(wordCloudItem);
}
return wordCloudItemList;
}
public EmotionResult getEmotionAnalysis(String categoryId) {
String regex = "<.*?>"; // 匹配HTML标签的正则表达式
String regEx="[\n`~!@#$%^&*()+=|{}':;',\\[\\].<>/?~@#¥%……&*()——+|{}【】‘;:”“’。, 、?]";
Map<String, Integer> temp = new ConcurrentHashMap<>();
// List<EmotionResult> emotionResultItemList = new ArrayList<>();
EmotionResult emotionResult1 = new EmotionResult();
@ -487,9 +507,19 @@ public class DocInfoService extends EsService<DocInfo> {
docInfo.setCateId(categoryId);
List<DocInfo> docInfoList = selectDocInfoList(docInfo);
if (CollectionUtils.isNotEmpty(docInfoList)) {
List<Term> termList = docInfoList.parallelStream().filter(ObjectUtils::isNotEmpty).flatMap(_docInfo -> NLPTokenizer.segment(_docInfo.getContent().replaceAll(regex, "").replaceAll("\\s+", "").replaceAll("[,  '“”.。]", "").trim()).stream()).collect(Collectors.toList());
// List<Term> termList = docInfoList.parallelStream().filter(ObjectUtils::isNotEmpty).flatMap(_docInfo -> NLPTokenizer.segment(_docInfo.getContent().replaceAll(regex, "").replaceAll("\\s+", "").replaceAll("[,  '“”.。]", "").trim()).stream()).collect(Collectors.toList());
List<Term> termList = docInfoList.parallelStream().filter(ObjectUtils::isNotEmpty).flatMap(_docInfo -> {
try {
return NLPTokenizer.
segment(FormatUtil.RemovalOfStopWords(_docInfo.getContent().replaceAll(regex, "").replaceAll("\\s+", "").replaceAll(regEx, "").trim())
).stream();
} catch (IOException e) {
e.printStackTrace();
return Stream.empty();
}
}).collect(Collectors.toList());
if (CollectionUtils.isNotEmpty(termList)) {
termList.parallelStream().forEach(term -> {
termList.parallelStream().filter(t->t.word.length()>1).forEach(term -> {
String word = term.word;
Integer value = term.getFrequency();
if (!temp.containsKey(word)) {
@ -500,6 +530,8 @@ public class DocInfoService extends EsService<DocInfo> {
});
}
}
int count = 0;
int count2 = 0;
for (Map.Entry<String, Integer> entry : temp.entrySet()) {

@ -0,0 +1,59 @@
package com.ruoyi.biemo.utils;
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.common.Term;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Text formatting helpers: stop-word removal backed by an external
 * stop-word list file.
 */
public class FormatUtil {

    /** Default location of the Chinese stop-word list, one word per line. */
    // NOTE(review): hard-coded Windows path — should move to configuration.
    private static final String DEFAULT_STOP_WORDS_PATH = "D:\\停用词.txt";

    private FormatUtil() {
        // utility class — no instances
    }

    /**
     * Removes stop words from the given text using the default stop-word file.
     *
     * @param oldString raw input text
     * @return the input re-joined from its HanLP segments with stop words removed
     * @throws IOException if the stop-word file cannot be read
     */
    public static String RemovalOfStopWords(String oldString) throws IOException {
        return RemovalOfStopWords(oldString, DEFAULT_STOP_WORDS_PATH);
    }

    /**
     * Removes stop words from the given text using the supplied stop-word file.
     *
     * @param oldString         raw input text
     * @param stopWordsFilePath path to a stop-word list, one word per line
     * @return the input re-joined from its HanLP segments with stop words removed
     * @throws IOException if the stop-word file cannot be read
     */
    public static String RemovalOfStopWords(String oldString, String stopWordsFilePath) throws IOException {
        // Segment first so removal operates on whole words, not substrings.
        List<Term> termList = HanLP.segment(oldString);

        // Load the stop-word list; try-with-resources guarantees the reader is
        // closed (the previous version leaked it on every call).
        // NOTE(review): FileReader uses the platform default charset — confirm
        // the stop-word file encoding matches it.
        List<String> stopWords = new ArrayList<>();
        try (BufferedReader bufferedReader = new BufferedReader(new FileReader(new File(stopWordsFilePath)))) {
            String line;
            while ((line = bufferedReader.readLine()) != null) {
                stopWords.add(line.trim());
            }
        }

        // Re-join the surviving words; StringBuilder avoids O(n^2) String +=.
        StringBuilder result = new StringBuilder(oldString.length());
        for (Term term : termList) {
            if (!stopWords.contains(term.word)) {
                result.append(term.word);
            }
        }
        return result.toString();
    }
}
Loading…
Cancel
Save