去除前端标签

master
xiaoCJ 2 years ago
parent 30eee5ac36
commit be3ae0be81

@ -14,6 +14,7 @@ import com.ruoyi.biemo.mongodb.utils.MongoHelper;
import com.ruoyi.biemo.nlp.DependencyParserUtils;
import com.ruoyi.biemo.nlp.SentimentAnalysisUtils;
import com.ruoyi.biemo.nlp.SummaryUtils;
import com.ruoyi.biemo.utils.FormatUtil;
import com.ruoyi.biemo.utils.MyObjects;
import com.ruoyi.common.core.domain.AjaxResult;
import com.ruoyi.common.utils.StringUtils;
@ -29,10 +30,12 @@ import org.springframework.context.event.EventListener;
import org.springframework.stereotype.Service;
import javax.print.Doc;
import java.io.IOException;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
* @author makesoft
@ -450,36 +453,53 @@ public class DocInfoService extends EsService<DocInfo> {
public List<WordCloudItem> getWordCloudByCateId(String categoryId) {
String regex = "<.*?>"; // 匹配HTML标签的正则表达式
String regEx="[\n`~1234567890!@#$%^&*()+=|{}':;',\\[\\].<>/?~@#¥%……&*()——+|{}【】‘;:”“’。, 、?\n" +
"  ]";
Map<String, Integer> temp = new ConcurrentHashMap<>();
List<WordCloudItem> wordCloudItemList = new ArrayList<>();
DocInfo docInfo = new DocInfo();
docInfo.setCateId(categoryId);
List<DocInfo> docInfoList = selectDocInfoList(docInfo);
if (CollectionUtils.isNotEmpty(docInfoList)) {
List<Term> termList = docInfoList.parallelStream().filter(ObjectUtils::isNotEmpty).flatMap(_docInfo -> NLPTokenizer.segment(_docInfo.getContent().replaceAll(regex, "").replaceAll("\\s+", "").replaceAll("[,  '“”.。]", "").trim()).stream()).collect(Collectors.toList());
if (CollectionUtils.isNotEmpty(termList)) {
termList.parallelStream().forEach(term -> {
String word = term.word;
Integer value = term.getFrequency();
if (!temp.containsKey(word)) {
temp.put(word, 1);
} else {
temp.put(word, temp.get(word) + 1);
}
});
List<Term> termList = docInfoList.parallelStream().filter(ObjectUtils::isNotEmpty).flatMap(_docInfo -> {
try {
return NLPTokenizer.
segment(FormatUtil.RemovalOfStopWords(_docInfo.getContent().replaceAll(regex, "").replaceAll("\\s+", "").replaceAll(regEx, "").trim())
).stream();
} catch (IOException e) {
e.printStackTrace();
return Stream.empty();
}
}).collect(Collectors.toList());
if (CollectionUtils.isNotEmpty(termList)) {
termList.parallelStream().filter(t->t.word.length()>1).forEach(term -> {
String word = term.word;
Integer value = term.getFrequency();
if (!temp.containsKey(word)) {
temp.put(word, 1);
} else {
temp.put(word, temp.get(word) + 1);
}
});
}
for (Map.Entry<String, Integer> entry : temp.entrySet()) {
WordCloudItem wordCloudItem = new WordCloudItem();
wordCloudItem.setName(entry.getKey());
wordCloudItem.setValue(entry.getValue());
wordCloudItemList.add(wordCloudItem);
}
return wordCloudItemList;
for(
Map.Entry<String, Integer> entry :temp.entrySet())
{
WordCloudItem wordCloudItem = new WordCloudItem();
wordCloudItem.setName(entry.getKey());
wordCloudItem.setValue(entry.getValue());
wordCloudItemList.add(wordCloudItem);
}
return wordCloudItemList;
}
public EmotionResult getEmotionAnalysis(String categoryId) {
String regex = "<.*?>"; // 匹配HTML标签的正则表达式
String regEx="[\n`~!@#$%^&*()+=|{}':;',\\[\\].<>/?~@#¥%……&*()——+|{}【】‘;:”“’。, 、?]";
Map<String, Integer> temp = new ConcurrentHashMap<>();
// List<EmotionResult> emotionResultItemList = new ArrayList<>();
EmotionResult emotionResult1 = new EmotionResult();
@ -487,9 +507,19 @@ public class DocInfoService extends EsService<DocInfo> {
docInfo.setCateId(categoryId);
List<DocInfo> docInfoList = selectDocInfoList(docInfo);
if (CollectionUtils.isNotEmpty(docInfoList)) {
List<Term> termList = docInfoList.parallelStream().filter(ObjectUtils::isNotEmpty).flatMap(_docInfo -> NLPTokenizer.segment(_docInfo.getContent().replaceAll(regex, "").replaceAll("\\s+", "").replaceAll("[,  '“”.。]", "").trim()).stream()).collect(Collectors.toList());
// List<Term> termList = docInfoList.parallelStream().filter(ObjectUtils::isNotEmpty).flatMap(_docInfo -> NLPTokenizer.segment(_docInfo.getContent().replaceAll(regex, "").replaceAll("\\s+", "").replaceAll("[,  '“”.。]", "").trim()).stream()).collect(Collectors.toList());
List<Term> termList = docInfoList.parallelStream().filter(ObjectUtils::isNotEmpty).flatMap(_docInfo -> {
try {
return NLPTokenizer.
segment(FormatUtil.RemovalOfStopWords(_docInfo.getContent().replaceAll(regex, "").replaceAll("\\s+", "").replaceAll(regEx, "").trim())
).stream();
} catch (IOException e) {
e.printStackTrace();
return Stream.empty();
}
}).collect(Collectors.toList());
if (CollectionUtils.isNotEmpty(termList)) {
termList.parallelStream().forEach(term -> {
termList.parallelStream().filter(t->t.word.length()>1).forEach(term -> {
String word = term.word;
Integer value = term.getFrequency();
if (!temp.containsKey(word)) {
@ -500,6 +530,8 @@ public class DocInfoService extends EsService<DocInfo> {
});
}
}
int count = 0;
int count2 = 0;
for (Map.Entry<String, Integer> entry : temp.entrySet()) {

@ -0,0 +1,59 @@
package com.ruoyi.biemo.utils;
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.common.Term;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Text formatting helpers: stop-word removal backed by an external
 * stop-word list file.
 */
public class FormatUtil {

    /** Default location of the Chinese stop-word list, one word per line. */
    // NOTE(review): hard-coded Windows path — should move to configuration.
    private static final String DEFAULT_STOP_WORDS_PATH = "D:\\停用词.txt";

    private FormatUtil() {
        // utility class — no instances
    }

    /**
     * Removes stop words from the given text using the default stop-word file.
     *
     * @param oldString raw input text
     * @return the input re-joined from its HanLP segments with stop words removed
     * @throws IOException if the stop-word file cannot be read
     */
    public static String RemovalOfStopWords(String oldString) throws IOException {
        return RemovalOfStopWords(oldString, DEFAULT_STOP_WORDS_PATH);
    }

    /**
     * Removes stop words from the given text using the supplied stop-word file.
     *
     * @param oldString         raw input text
     * @param stopWordsFilePath path to a stop-word list, one word per line
     * @return the input re-joined from its HanLP segments with stop words removed
     * @throws IOException if the stop-word file cannot be read
     */
    public static String RemovalOfStopWords(String oldString, String stopWordsFilePath) throws IOException {
        // Segment first so removal operates on whole words, not substrings.
        List<Term> termList = HanLP.segment(oldString);

        // Load the stop-word list; try-with-resources guarantees the reader is
        // closed (the previous version leaked it on every call).
        // NOTE(review): FileReader uses the platform default charset — confirm
        // the stop-word file encoding matches it.
        List<String> stopWords = new ArrayList<>();
        try (BufferedReader bufferedReader = new BufferedReader(new FileReader(new File(stopWordsFilePath)))) {
            String line;
            while ((line = bufferedReader.readLine()) != null) {
                stopWords.add(line.trim());
            }
        }

        // Re-join the surviving words; StringBuilder avoids O(n^2) String +=.
        StringBuilder result = new StringBuilder(oldString.length());
        for (Term term : termList) {
            if (!stopWords.contains(term.word)) {
                result.append(term.word);
            }
        }
        return result.toString();
    }
}
Loading…
Cancel
Save