词性标注优化

master
hujunbo 2 years ago
parent 350a61aece
commit 0ee2c65c74

@ -1,17 +1,21 @@
package com.ruoyi.biemo.business.controller;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.hankcs.hanlp.corpus.document.sentence.Sentence;
import com.hankcs.hanlp.corpus.document.sentence.word.IWord;
import com.hankcs.hanlp.corpus.document.sentence.word.Word;
import com.ruoyi.biemo.business.domain.Category;
import com.ruoyi.biemo.business.domain.DocInfo;
import com.ruoyi.biemo.business.service.CategoryService;
import com.ruoyi.biemo.business.service.DocInfoService;
import com.ruoyi.biemo.core.page.Page;
import com.ruoyi.biemo.core.page.PageFactory;
import com.ruoyi.biemo.nlp.DependencyParserUtils;
import com.ruoyi.biemo.utils.MyObjects;
import com.ruoyi.common.annotation.Log;
import com.ruoyi.common.core.controller.BaseController;
import com.ruoyi.common.core.domain.AjaxResult;
import com.ruoyi.common.core.page.TableDataInfo;
import com.ruoyi.common.enums.BusinessType;
import com.ruoyi.common.utils.poi.ExcelUtil;
import org.springframework.beans.factory.annotation.Autowired;
@ -19,7 +23,10 @@ import org.springframework.security.access.prepost.PreAuthorize;
import org.springframework.web.bind.annotation.*;
import javax.servlet.http.HttpServletResponse;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
@ -81,14 +88,68 @@ public class DocInfoController extends BaseController {
*/
// Loads one document by id, cleans its content with MyObjects.delHTMLTag and
// MyObjects.delSpace, runs DependencyParserUtils.nerAnalysis on the cleaned text and
// returns a JSON payload holding: "content", "connections" (always an empty array),
// "connectionCategories", "labelCategories" and, when words were found, "labels".
// NOTE(review): this span shows two signatures and two return statements (one returning
// the DocInfo, one returning the JSON payload); they look like the before/after of one
// change rendered together - confirm which lines are live before editing.
@PreAuthorize("@ss.hasPermi('biemo:docInfo:query')")
@GetMapping(value = "/getInfoDelHtml/{id}")
public AjaxResult getInfoDelHtml(@PathVariable("id") String id)
{
DocInfo docInfo = docInfoService.selectDocInfoById(id);
public AjaxResult getInfoDelHtml(@PathVariable("id") String docInfoId) throws IOException {
// NOTE(review): docInfo is dereferenced below without a null check; presumably
// selectDocInfoById can return null for an unknown id - verify.
DocInfo docInfo = docInfoService.selectDocInfoById(docInfoId);
JSONObject result = new JSONObject();
String content = docInfo.getContent();
content = MyObjects.delHTMLTag(content);
content = MyObjects.delSpace(content);
docInfo.setContent(content);
return AjaxResult.success(docInfo);
JSONArray labels = new JSONArray();
result.put("content",content);
result.put("connections",new JSONArray());
result.put("connectionCategories",MyObjects.connectionCategories);
result.put("labelCategories",MyObjects.labelCategories);
Sentence sentence = DependencyParserUtils.nerAnalysis(content);
List<IWord> wordList = sentence.wordList;
if(wordList!=null&&wordList.size()>0){
// offset is the running character position: it advances by every word's length, so the
// start/end indexes assume the words concatenate back to content - TODO confirm.
int offset = 0;
// id advances for every word, labelled or not, so emitted label ids may have gaps.
int id = 0;
for(int i=0;i<wordList.size();i++){
String value = wordList.get(i).getValue();
String label = wordList.get(i).getLabel();
// Only tags present in MyObjects.wordsMapping produce a label entry.
if(MyObjects.wordsMapping.get(label)!=null){
Integer startIndex = offset;
Integer endIndex = offset+value.length();
Integer categoryId = MyObjects.wordsMapping.get(label);
JSONObject jsonObject = new JSONObject();
jsonObject.put("id",id);
jsonObject.put("categoryId",categoryId);
jsonObject.put("startIndex",startIndex);
jsonObject.put("endIndex",endIndex);
labels.add(jsonObject);
}
offset += value.length();
id++;
}
// "labels" is only added inside this branch; with an empty word list the key is absent.
result.put("labels",labels);
}
return AjaxResult.success("查询成功",result);
}
/**
 * Receives manually corrected part-of-speech labels and feeds them to the analyzer
 * as one training sentence via {@link DependencyParserUtils#learn}.
 *
 * <p>Expected body: {@code content} (the annotated text) and {@code labels}, an array of
 * objects carrying {@code startIndex}, {@code endIndex} (end-exclusive offsets into
 * {@code content}) and {@code categoryId} (a key of {@code MyObjects.wordsMappingIndex}).
 *
 * <p>An absent or empty {@code labels} array is a no-op that still reports success.
 * Malformed labels are rejected with an error result instead of surfacing as an
 * unhandled NullPointerException / StringIndexOutOfBoundsException.
 *
 * <p>NOTE(review): unlike the neighbouring query endpoint this mapping carries no
 * {@code @PreAuthorize}; since it rewrites the shared model files, confirm that is intended.
 *
 * @param jsonObject request body as described above
 * @return success when nothing had to be learned or learning was attempted; error on invalid input
 * @throws IOException if saving the updated models fails
 */
@PostMapping("/uploadPosTag")
public AjaxResult uploadPosTag(@RequestBody JSONObject jsonObject) throws IOException {
    String content = jsonObject.getString("content");
    JSONArray labels = jsonObject.getJSONArray("labels");
    if (labels == null || labels.isEmpty()) {
        return AjaxResult.success();
    }
    if (content == null) {
        return AjaxResult.error("标注内容不能为空");
    }

    // Validate every label up front so a bad entry cannot trigger a partial learn.
    List<JSONObject> orderedLabels = new ArrayList<>(labels.size());
    for (int i = 0; i < labels.size(); i++) {
        JSONObject label = labels.getJSONObject(i);
        if (label == null) {
            return AjaxResult.error("标注位置不合法");
        }
        Integer startIndex = label.getInteger("startIndex");
        Integer endIndex = label.getInteger("endIndex");
        if (startIndex == null || endIndex == null
                || startIndex < 0 || startIndex >= endIndex || endIndex > content.length()) {
            return AjaxResult.error("标注位置不合法");
        }
        if (MyObjects.wordsMappingIndex.get(label.getInteger("categoryId")) == null) {
            return AjaxResult.error("未知的词性类别");
        }
        orderedLabels.add(label);
    }

    // The client may send labels in creation order; the learner consumes the words as a
    // sequence, so put them back into text order first (stable for equal starts).
    orderedLabels.sort((left, right) ->
            Integer.compare(left.getIntValue("startIndex"), right.getIntValue("startIndex")));

    List<IWord> wordList = new ArrayList<>(orderedLabels.size());
    for (JSONObject label : orderedLabels) {
        String value = content.substring(label.getIntValue("startIndex"), label.getIntValue("endIndex"));
        String tag = MyObjects.wordsMappingIndex.get(label.getInteger("categoryId"));
        wordList.add(new Word(value, tag));
    }
    DependencyParserUtils.learn(new Sentence(wordList));
    return AjaxResult.success();
}
/**

@ -34,6 +34,28 @@ public class DependencyParserUtils {
return analyzer.analyze(text);
}
/**
 * Online-learns one annotated sentence and persists the result.
 *
 * <p>A fresh analyzer is built from the three configured perceptron model paths
 * (segmentation, POS, NER). When {@code learn} reports success, all three models are
 * written back to those same paths; otherwise nothing is saved.
 *
 * @param sentence annotated sentence to learn from
 * @return the value reported by the analyzer's {@code learn} call
 * @throws IOException if loading or saving a model fails
 */
public static Boolean learn(Sentence sentence) throws IOException {
    PerceptronLexicalAnalyzer lexicalAnalyzer = new PerceptronLexicalAnalyzer(
            HanLP.Config.PerceptronCWSModelPath,
            HanLP.Config.PerceptronPOSModelPath,
            HanLP.Config.PerceptronNERModelPath);
    Boolean learned = lexicalAnalyzer.learn(sentence);
    if (!learned) {
        return learned;
    }
    lexicalAnalyzer.getPerceptronSegmenter().getModel().save(HanLP.Config.PerceptronCWSModelPath);
    lexicalAnalyzer.getPerceptronPOSTagger().getModel().save(HanLP.Config.PerceptronPOSModelPath);
    lexicalAnalyzer.getPerceptionNERecognizer().getModel().save(HanLP.Config.PerceptronNERModelPath);
    return learned;
}
/**
 * Manual smoke check: analyses a fixed sample text and prints the result with
 * translated labels.
 */
public static void main(String[] args) throws Exception{
    final String sample = "里根号核动力航空母舰驶入南海,里根号他的船很大,极大的像一只傻逼。我爱北京天安门,中国万岁";
    final Sentence analysed = nerAnalysis(sample);
    System.out.println(analysed.translateLabels());
    //analyzer.getPerceptronPOSTagger().getModel().save(HanLP.Config.PerceptronPOSModelPath);
    //analyzer.learn("与/c 特朗普/nr 通/v 电话/n 讨论/v");
}
public static Map<String,Set<String>> getMyNERTagSet(String[] strArr) throws Exception{
Map<String,Set<String>> result = new HashMap<>();
if(strArr!=null&&strArr.length>0){

@ -1,14 +1,100 @@
package com.ruoyi.biemo.utils;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.apache.commons.lang3.StringUtils;
import java.lang.reflect.Field;
import java.lang.reflect.Type;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class MyObjects {
/** POS tag -> label category id. */
public static Map<String,Integer> wordsMapping = new HashMap<>();
/** Label category id -> POS tag (inverse of {@link #wordsMapping}). */
public static Map<Integer,String> wordsMappingIndex = new HashMap<>();
/** Label category descriptors (id, text, color, border-color) sent to the annotation UI. */
public static JSONArray labelCategories = new JSONArray();
/** Connection category descriptors (id, text) sent to the annotation UI. */
public static JSONArray connectionCategories = new JSONArray();

static {
    // Position in this array is the category id:
    // noun, person name, place name, organization, verb, adjective, adverb.
    String[] posTags = {"n", "nr", "ns", "nt", "v", "a", "d"};
    for (int categoryId = 0; categoryId < posTags.length; categoryId++) {
        wordsMapping.put(posTags[categoryId], categoryId);
        wordsMappingIndex.put(categoryId, posTags[categoryId]);
    }

    // One row per label category, in id order: {text, color, border-color}.
    String[][] labelRows = {
            {"名词", "#eac0a2", "#8c7361"},
            {"人名", "#eac0a2", "#8c7361"},
            {"地名", "#eac0a2", "#8c7361"},
            {"机构团体", "#eac0a2", "#8c7361"},
            {"动词", "#619dff", "#3c619d"},
            {"形容词", "#9d61ff", "#613C9D"},
            {"副词", "#ff9d61", "#995e3a"},
    };
    for (int categoryId = 0; categoryId < labelRows.length; categoryId++) {
        JSONObject category = new JSONObject();
        category.put("id", categoryId);
        category.put("text", labelRows[categoryId][0]);
        category.put("color", labelRows[categoryId][1]);
        category.put("border-color", labelRows[categoryId][2]);
        labelCategories.add(category);
    }

    // Connection category texts, in id order.
    String[] connectionTexts = {"修饰", "限定", "是...的动作"};
    for (int categoryId = 0; categoryId < connectionTexts.length; categoryId++) {
        JSONObject category = new JSONObject();
        category.put("id", categoryId);
        category.put("text", connectionTexts[categoryId]);
        connectionCategories.add(category);
    }
}
/**
*
* ps: booleanfalse null

@ -11,10 +11,15 @@
package com.hankcs.demo;
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.corpus.document.sentence.Sentence;
import com.hankcs.hanlp.corpus.document.sentence.word.IWord;
import com.hankcs.hanlp.corpus.document.sentence.word.Word;
import com.hankcs.hanlp.model.perceptron.PerceptronLexicalAnalyzer;
import com.hankcs.hanlp.utility.TestUtility;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
*
@ -36,12 +41,19 @@ public class DemoPerceptronLexicalAnalyzer extends TestUtility
//System.out.println(analyzer.analyze("微软公司於1975年由比爾·蓋茲和保羅·艾倫創立18年啟動以智慧雲端、前端為導向的大改組。").translateLabels());
// 任何模型总会有失误特别是98年这种陈旧的语料库
System.out.println(analyzer.analyze("总统普京与特朗普通电话讨论太空探索技术公司").translateLabels());
System.out.println(analyzer.seg2sentence("总统普京与特朗普通电话讨论太空探索技术公司"));
System.out.println(analyzer.analyze("总统普京与特朗普通电话讨论太空探索技术公司,麻皮粗壮是我的名字"));
//System.out.println(analyzer.seg2sentence("总统普京与特朗普通电话讨论太空探索技术公司"));
// 支持在线学习
//analyzer.learn("与/c 特朗普/nr 通/v 电话/n 讨论/v [太空/s 探索/vn 技术/n 公司/n]/nt");
List<IWord> wordList = new ArrayList<>();
Word word = new Word(null,null);
word.setLabel("nt");
word.setValue("俄乌冲突");
wordList.add(word);
Sentence sentence = new Sentence(wordList);
analyzer.learn(sentence);
// 学习到新知识
System.out.println(analyzer.analyze("总统普京与特朗普通电话讨论太空探索技术公司"));
System.out.println(analyzer.analyze("总统普京与特朗普通电话讨论太空探索技术公司,麻皮粗壮是我的名字"));
// 还可以举一反三
//System.out.println(analyzer.analyze("主席和特朗普通电话"));
@ -51,8 +63,9 @@ public class DemoPerceptronLexicalAnalyzer extends TestUtility
//System.out.println(analyzer.analyze("我在四川金华出生,我的名字叫金华"));
// 在线学习后的模型支持序列化,以分词模型为例:
//analyzer.getPerceptronSegmenter().getModel().save("data/model/perceptron/pku199801/cws.bin");
//analyzer.getPerceptronPOSTagger().getModel().save(HanLP.Config.PerceptronPOSModelPath);
analyzer.getPerceptronSegmenter().getModel().save(HanLP.Config.PerceptronCWSModelPath);
analyzer.getPerceptronPOSTagger().getModel().save(HanLP.Config.PerceptronPOSModelPath);
analyzer.getPerceptionNERecognizer().getModel().save(HanLP.Config.PerceptronNERModelPath);
// 请用户按需执行对空格制表符等的预处理,只有你最清楚自己的文本中都有些什么奇怪的东西
// System.out.println(analyzer.analyze("空格 \t\n\r\f&nbsp;统统都不要"
// .replaceAll("\\s+", "") // 去除所有空白符

Binary file not shown.

After

Width:  |  Height:  |  Size: 26 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 14 KiB

@ -16,14 +16,21 @@ export function getDocInfo(id) {
method: 'get'
})
}
// 查询文章管理详细
// 查询文章详情 组装词性标注
/**
 * Issues a GET for the getInfoDelHtml endpoint of the given document.
 * @param id document id appended to the endpoint path
 * @returns whatever `request` returns for this call
 */
export function getInfoDelHtml(id) {
  const url = '/makesoft/docInfo/getInfoDelHtml/' + id
  return request({ url, method: 'get' })
}
/**
 * Uploads part-of-speech annotations.
 *
 * The path is sent without a trailing slash so it matches the server mapping
 * exactly instead of depending on trailing-slash matching being enabled.
 *
 * @param data request body (the annotator JSON)
 * @returns whatever `request` returns for this call
 */
export function uploadPosTag(data) {
  return request({
    url: '/makesoft/docInfo/uploadPosTag',
    method: 'post',
    data: data
  })
}
// 新增文章管理
export function addDocInfo(data) {
return request({

@ -98,7 +98,8 @@ Vue.config.productionTip = false
import {setToken} from '@/utils/auth';
import {ticket2token} from '@/api/login'
import {getQueryObject} from '@/utils/index';
let ticket = getQueryObject().ticket;
//let ticket = getQueryObject().ticket;
let ticket = null;
if(ticket){
ticket2token({ticket:ticket}).then(res =>{
let sysToken = res.data;

@ -8,7 +8,7 @@ import { isRelogin } from '@/utils/request'
NProgress.configure({ showSpinner: false })
const whiteList = ['/login', '/auth-redirect', '/bind', '/register']
const whiteList = ['/login', '/auth-redirect', '/bind', '/register','/review']
router.beforeEach((to, from, next) => {
NProgress.start()

@ -46,6 +46,11 @@ export const constantRoutes = [
component: () => import('@/views/login'),
hidden: true
},
{
path: '/review',
component: () => import('@/views/review'),
hidden: true
},
{
path: '/register',
component: () => import('@/views/register'),

@ -1,5 +1,5 @@
<template>
<div class="demo">
<div class="demo" v-loading="loading">
<div class="content">
<div class="left">
<div class="svgContainer" ref="svgContainer"></div>
@ -89,7 +89,7 @@
</v-card-title>
<v-card-text>
<v-radio-group v-model="selectedLabelCategory">
<v-radio v-for="category in this.labelCategories"
<v-radio v-for="category in this.labelCategoryRepo"
:key="category.id"
:label="category.text"
:value="category.id"></v-radio>
@ -112,7 +112,7 @@
</v-card-title>
<v-card-text>
<v-radio-group v-model="selectedConnectionCategory">
<v-radio v-for="category in this.connectionCategories"
<v-radio v-for="category in this.connectionCategoryRepo"
:key="category.id"
:label="category.text"
:value="category.id"></v-radio>
@ -146,31 +146,49 @@
import {Action, Annotator} from 'poplar-annotation'
import hljs from 'highlight.js'
//const defaultJson = "{\"content\":\"宿椿广\",\"labelCategories\":[{\"id\":\"0\",\"text\":\"\",\"color\":\"#eac0a2\",\"border-color\":\"#8c7361\"},{\"id\":\"1\",\"text\":\"\",\"color\":\"#619dff\",\"border-color\":\"#3c619d\"},{\"id\":\"2\",\"text\":\"\",\"color\":\"#9d61ff\",\"border-color\":\"#613C9D\"},{\"id\":\"3\",\"text\":\"\",\"color\":\"#ff9d61\",\"border-color\":\"#995e3a\"}],\"labels\":[{\"id\":\"0\",\"categoryId\":\"0\",\"startIndex\":0,\"endIndex\":2},{\"id\":\"1\",\"categoryId\":\"0\",\"startIndex\":3,\"endIndex\":4},{\"id\":\"2\",\"categoryId\":\"0\",\"startIndex\":216,\"endIndex\":217},{\"id\":\"3\",\"categoryId\":\"2\",\"startIndex\":217,\"endIndex\":218},{\"id\":\"4\",\"categoryId\":\"0\",\"startIndex\":219,\"endIndex\":220},{\"id\":\"5\",\"categoryId\":\"2\",\"startIndex\":220,\"endIndex\":221},{\"id\":\"6\",\"categoryId\":\"0\",\"startIndex\":32,\"endIndex\":33},{\"id\":\"7\",\"categoryId\":\"1\",\"startIndex\":46,\"endIndex\":47},{\"id\":\"8\",\"categoryId\":\"0\",\"startIndex\":78,\"endIndex\":80},{\"id\":\"9\",\"categoryId\":\"1\",\"startIndex\":64,\"endIndex\":65}],\"connectionCategories\":[{\"id\":\"0\",\"text\":\"\"},{\"id\":\"1\",\"text\":\"\"},{\"id\":\"2\",\"text\":\"...\"}],\"connections\":[{\"id\":\"0\",\"categoryId\":\"2\",\"fromId\":\"7\",\"toId\":\"6\"},{\"id\":\"1\",\"categoryId\":\"0\",\"fromId\":\"3\",\"toId\":\"2\"},{\"id\":\"2\",\"categoryId\":\"0\",\"fromId\":\"5\",\"toId\":\"4\"},{\"id\":\"3\",\"categoryId\":\"2\",\"fromId\":\"9\",\"toId\":\"6\"}]}";
const defaultJson = "{\"content\":\"%s\",\"labelCategories\":[{\"id\":\"0\",\"text\":\"名词\",\"color\":\"#eac0a2\",\"border-color\":\"#8c7361\"},{\"id\":\"1\",\"text\":\"动词\",\"color\":\"#619dff\",\"border-color\":\"#3c619d\"},{\"id\":\"2\",\"text\":\"形容词\",\"color\":\"#9d61ff\",\"border-color\":\"#613C9D\"},{\"id\":\"3\",\"text\":\"副词\",\"color\":\"#ff9d61\",\"border-color\":\"#995e3a\"}],\"labels\":[],\"connectionCategories\":[],\"connections\":[]}";
import {getInfoDelHtml} from "@/api/biemo/docInfo";
//const defaultJson = "{\"content\":\"%s\",\"labelCategories\":[{\"id\":\"0\",\"text\":\"\",\"color\":\"#eac0a2\",\"border-color\":\"#8c7361\"},{\"id\":\"1\",\"text\":\"\",\"color\":\"#619dff\",\"border-color\":\"#3c619d\"},{\"id\":\"2\",\"text\":\"\",\"color\":\"#9d61ff\",\"border-color\":\"#613C9D\"},{\"id\":\"3\",\"text\":\"\",\"color\":\"#ff9d61\",\"border-color\":\"#995e3a\"}],\"labels\":[],\"connectionCategories\":[],\"connections\":[]}";
import {getInfoDelHtml,uploadPosTag} from "@/api/biemo/docInfo";
import { MessageBox } from 'element-ui'
export default {
name: 'posTagging',
data: function () {
return {
id: '',
loading: false,
defaultJson: null,
labelCategoryRepo: [{
"id": 0,
"text": "名词",
"color": "#eac0a2",
"border-color": "#8c7361"
}, {
},{
"id": 1,
"text": "人名",
"color": "#eac0a2",
"border-color": "#8c7361"
},{
"id": 2,
"text": "地名",
"color": "#eac0a2",
"border-color": "#8c7361"
}, {
"id": 3,
"text": "机构团体",
"color": "#eac0a2",
"border-color": "#8c7361"
},{
"id": 4,
"text": "动词",
"color": "#619dff",
"border-color": "#3c619d"
}, {
"id": 2,
"id": 5,
"text": "形容词",
"color": "#9d61ff",
"border-color": "#613C9D"
}, {
"id": 3,
"id": 6,
"text": "副词",
"color": "#ff9d61",
"border-color": "#995e3a"
@ -199,13 +217,9 @@ export default {
}
},
mounted:function(){
getInfoDelHtml(this.id).then(response =>{
let json = defaultJson.replace(/%s/ig,response.data.content);
this.constructAnnotator(json);
})
console.log(this.defaultJson);
},
created(){
created() {
this.getParams()
},
watch: {
@ -213,7 +227,15 @@ export default {
},
methods: {
getParams(){
this.loading = true;
this.id = this.$route.params && this.$route.params.id;
getInfoDelHtml(this.id).then(response =>{
this.defaultJson = response.data;
this.labelCategoryRepo = response.data.labelCategories;
this.connectionCategoryRepo = response.data.connectionCategories;
this.constructAnnotator(this.defaultJson);
this.loading = false;
})
},
constructAnnotator: function (data) {
this.annotator = new Annotator(data, this.$refs.svgContainer);
@ -244,10 +266,15 @@ export default {
},
// Marks the component as "uploaded" and (re)builds the annotator from default data.
// NOTE(review): two constructAnnotator calls appear here, one with the module-level
// `defaultJson` and one with `this.defaultJson`; they look like the before/after of the
// same change - confirm which one is meant to remain.
useDefault: function () {
this.uploaded = true;
this.constructAnnotator(defaultJson);
this.constructAnnotator(this.defaultJson);
},
upload: function (e) {
uploadPosTag(this.annotator.store.json).then(response =>{
if(response.code==200){
MessageBox.alert("上传成功", "系统提示")
this.reload();
}
});
},
createLabel() {
this.annotator.applyAction(Action.Label.Create(this.selectedLabelCategory, this.startIndex, this.endIndex));

File diff suppressed because one or more lines are too long
Loading…
Cancel
Save