From be3ae0be81d75cc06ea9b40f3ea0c7a7e468a346 Mon Sep 17 00:00:00 2001 From: xiaoCJ <406612557@qq.com> Date: Sun, 25 Jun 2023 17:17:39 +0800 Subject: [PATCH] =?UTF-8?q?=E5=8E=BB=E9=99=A4=E5=89=8D=E7=AB=AF=E6=A0=87?= =?UTF-8?q?=E7=AD=BE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../business/service/DocInfoService.java | 74 +++++++++++++------ .../com/ruoyi/biemo/utils/FormatUtil.java | 59 +++++++++++++++ 2 files changed, 112 insertions(+), 21 deletions(-) create mode 100644 ruoyi-biemo/src/main/java/com/ruoyi/biemo/utils/FormatUtil.java diff --git a/ruoyi-biemo/src/main/java/com/ruoyi/biemo/business/service/DocInfoService.java b/ruoyi-biemo/src/main/java/com/ruoyi/biemo/business/service/DocInfoService.java index 8ea237d..a6922a8 100644 --- a/ruoyi-biemo/src/main/java/com/ruoyi/biemo/business/service/DocInfoService.java +++ b/ruoyi-biemo/src/main/java/com/ruoyi/biemo/business/service/DocInfoService.java @@ -14,6 +14,7 @@ import com.ruoyi.biemo.mongodb.utils.MongoHelper; import com.ruoyi.biemo.nlp.DependencyParserUtils; import com.ruoyi.biemo.nlp.SentimentAnalysisUtils; import com.ruoyi.biemo.nlp.SummaryUtils; +import com.ruoyi.biemo.utils.FormatUtil; import com.ruoyi.biemo.utils.MyObjects; import com.ruoyi.common.core.domain.AjaxResult; import com.ruoyi.common.utils.StringUtils; @@ -29,10 +30,12 @@ import org.springframework.context.event.EventListener; import org.springframework.stereotype.Service; import javax.print.Doc; +import java.io.IOException; import java.util.*; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Collectors; +import java.util.stream.Stream; /** * @author makesoft @@ -450,36 +453,53 @@ public class DocInfoService extends EsService { public List getWordCloudByCateId(String categoryId) { String regex = "<.*?>"; // 匹配HTML标签的正则表达式 + String regEx="[\n`~1234567890!@#$%^&*()+=|{}':;',\\[\\].<>/?~!@#¥%……&*()——+|{}【】‘;:”“’。, 、?\n" 
+ + "  ]"; Map temp = new ConcurrentHashMap<>(); List wordCloudItemList = new ArrayList<>(); DocInfo docInfo = new DocInfo(); docInfo.setCateId(categoryId); List docInfoList = selectDocInfoList(docInfo); - if (CollectionUtils.isNotEmpty(docInfoList)) { - List termList = docInfoList.parallelStream().filter(ObjectUtils::isNotEmpty).flatMap(_docInfo -> NLPTokenizer.segment(_docInfo.getContent().replaceAll(regex, "").replaceAll("\\s+", "").replaceAll("[,,  '“”.。]", "").trim()).stream()).collect(Collectors.toList()); - if (CollectionUtils.isNotEmpty(termList)) { - termList.parallelStream().forEach(term -> { - String word = term.word; - Integer value = term.getFrequency(); - if (!temp.containsKey(word)) { - temp.put(word, 1); - } else { - temp.put(word, temp.get(word) + 1); - } - }); + List termList = docInfoList.parallelStream().filter(ObjectUtils::isNotEmpty).flatMap(_docInfo -> { + try { + return NLPTokenizer. + segment(FormatUtil.RemovalOfStopWords(_docInfo.getContent().replaceAll(regex, "").replaceAll("\\s+", "").replaceAll(regEx, "").trim()) + ).stream(); + } catch (IOException e) { + e.printStackTrace(); + return Stream.empty(); } + }).collect(Collectors.toList()); + if (CollectionUtils.isNotEmpty(termList)) { + termList.parallelStream().filter(t->t.word.length()>1).forEach(term -> { + String word = term.word; + Integer value = term.getFrequency(); + if (!temp.containsKey(word)) { + temp.put(word, 1); + } else { + temp.put(word, temp.get(word) + 1); + } + }); } - for (Map.Entry entry : temp.entrySet()) { - WordCloudItem wordCloudItem = new WordCloudItem(); - wordCloudItem.setName(entry.getKey()); - wordCloudItem.setValue(entry.getValue()); - wordCloudItemList.add(wordCloudItem); - } - return wordCloudItemList; + + for( + Map.Entry entry :temp.entrySet()) + + { + WordCloudItem wordCloudItem = new WordCloudItem(); + wordCloudItem.setName(entry.getKey()); + wordCloudItem.setValue(entry.getValue()); + wordCloudItemList.add(wordCloudItem); } + return wordCloudItemList; 
+} + + + public EmotionResult getEmotionAnalysis(String categoryId) { String regex = "<.*?>"; // 匹配HTML标签的正则表达式 + String regEx="[\n`~!@#$%^&*()+=|{}':;',\\[\\].<>/?~!@#¥%……&*()——+|{}【】‘;:”“’。, 、?]"; Map temp = new ConcurrentHashMap<>(); // List emotionResultItemList = new ArrayList<>(); EmotionResult emotionResult1 = new EmotionResult(); @@ -487,9 +507,19 @@ public class DocInfoService extends EsService { docInfo.setCateId(categoryId); List docInfoList = selectDocInfoList(docInfo); if (CollectionUtils.isNotEmpty(docInfoList)) { - List termList = docInfoList.parallelStream().filter(ObjectUtils::isNotEmpty).flatMap(_docInfo -> NLPTokenizer.segment(_docInfo.getContent().replaceAll(regex, "").replaceAll("\\s+", "").replaceAll("[,,  '“”.。]", "").trim()).stream()).collect(Collectors.toList()); +// List termList = docInfoList.parallelStream().filter(ObjectUtils::isNotEmpty).flatMap(_docInfo -> NLPTokenizer.segment(_docInfo.getContent().replaceAll(regex, "").replaceAll("\\s+", "").replaceAll("[,,  '“”.。]", "").trim()).stream()).collect(Collectors.toList()); + List termList = docInfoList.parallelStream().filter(ObjectUtils::isNotEmpty).flatMap(_docInfo -> { + try { + return NLPTokenizer. 
package com.ruoyi.biemo.utils;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.common.Term;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * Text clean-up helpers built on HanLP segmentation.
 */
public final class FormatUtil {

    /**
     * Stop-word dictionary location.
     * TODO(review): hard-coded Windows path — externalize to configuration
     * or load from the classpath so deployments off this machine work.
     */
    private static final String STOP_WORDS_PATH = "D:\\停用词.txt";

    private FormatUtil() {
        // Utility class — no instances.
    }

    /**
     * Removes stop words from the given Chinese text.
     *
     * The text is segmented with {@link HanLP#segment(String)}; every token
     * found in the stop-word dictionary is dropped and the remaining tokens
     * are re-joined without separators.
     *
     * NOTE(review): name violates lowerCamelCase but is kept because
     * DocInfoService already calls it under this name.
     *
     * @param oldString original Chinese text (may be null or empty)
     * @return the text with stop words removed; null/empty input is returned as-is
     * @throws IOException if the stop-word dictionary cannot be read
     */
    public static String RemovalOfStopWords(String oldString) throws IOException {
        if (oldString == null || oldString.isEmpty()) {
            return oldString;
        }
        List<Term> termList = HanLP.segment(oldString);
        Set<String> stopWords = loadStopWords();
        // StringBuilder replaces the original String += loop (O(n^2) copying);
        // HashSet lookup replaces List.removeAll(List) (O(n*m) scans).
        StringBuilder result = new StringBuilder(oldString.length());
        for (Term term : termList) {
            if (!stopWords.contains(term.word)) {
                result.append(term.word);
            }
        }
        return result.toString();
    }

    /**
     * Loads the stop-word dictionary, one trimmed word per line.
     * Re-read on every call; cache as a static set if this becomes hot.
     */
    private static Set<String> loadStopWords() throws IOException {
        Set<String> stopWords = new HashSet<>();
        // try-with-resources fixes the reader leak in the original.
        // FileReader keeps the platform default charset on purpose — the
        // original behaved this way and the dictionary's encoding is
        // unknown. NOTE(review): confirm the file's encoding and pass an
        // explicit charset once known.
        try (BufferedReader reader = new BufferedReader(new FileReader(STOP_WORDS_PATH))) {
            String line;
            while ((line = reader.readLine()) != null) {
                stopWords.add(line.trim());
            }
        }
        return stopWords;
    }
}