feature(地址清洗):地址清洗开发

1. 地址分为3类(单套、楼栋、小区;其中小区包含多个楼栋或者单套)
2. 基于联城数库中的路名拆分,保证联城中的路名完全匹配;
3. 联城数库中的小区路名清洗(XX路、XX大道)
This commit is contained in:
purple 2020-08-05 18:45:47 +08:00
parent 27db250c13
commit daca46924b
14 changed files with 847 additions and 10 deletions

View File

@ -0,0 +1,16 @@
package com.ruoyi.project.tool.address;
/**
* Address builder: turns raw address text into a {@link StandardAddress}.
*
* @author lihe
*/
public interface AddressBuilder {
/**
* Cleans the given address text.
*
* @param text raw address text to clean
* @return the cleaned result as a {@link StandardAddress}
*/
StandardAddress clear(String text);
}

View File

@ -0,0 +1,30 @@
package com.ruoyi.project.tool.address;
import com.ruoyi.project.tool.address.model.AddressType;
/**
 * One address entry: an {@link AddressType} tag plus text that is accumulated
 * piece by piece through {@link #appendContent(String)}.
 *
 * @author lihe
 */
public class AddressContent {

    /** Type tag supplied at construction time. */
    private final AddressType addressType;

    /** Accumulates every appended piece in call order. */
    private final StringBuilder buffer = new StringBuilder();

    /**
     * @param addressType type tag later reported by {@link #getAddressType()}
     */
    public AddressContent(AddressType addressType) {
        this.addressType = addressType;
    }

    /**
     * @return the type tag given to the constructor
     */
    public AddressType getAddressType() {
        return this.addressType;
    }

    /**
     * Appends one piece of text to the accumulated content.
     *
     * @param addressNodeContent text to append
     */
    public void appendContent(String addressNodeContent) {
        this.buffer.append(addressNodeContent);
    }

    /**
     * @return everything appended so far, concatenated in call order
     */
    public String getResult() {
        return this.buffer.toString();
    }
}

View File

@ -0,0 +1,42 @@
package com.ruoyi.project.tool.address;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
/**
 * Parsing context for one address: holds the working text and the nodes
 * that have been recognised in it so far.
 *
 * @author lihe
 */
public class AddressContext {

    /**
     * Recognised nodes, kept in insertion order.
     * TODO (carried over from the original note): consider a FIFO queue instead.
     */
    private final List<AddressNode> nodeList = new ArrayList<>();

    /** Mutable working copy of the address text. */
    private final StringBuilder stringBuilder;

    /**
     * @param text address text to parse
     */
    public AddressContext(String text) {
        this.stringBuilder = new StringBuilder(text);
    }

    /**
     * @return the live, mutable working text (not a copy)
     */
    public StringBuilder getContent() {
        return this.stringBuilder;
    }

    /**
     * Records a recognised node.
     *
     * @param node node to append to the node list
     */
    public void addAddressNode(AddressNode node) {
        this.nodeList.add(node);
    }

    /**
     * Builds the result for the current working text.
     *
     * @return a {@link StandardAddress} created from the current text
     */
    public StandardAddress getResult() {
        // TODO: convert the collected nodes into AddressContent entries; at present
        // only the text is handed to StandardAddress and the nodes are not used.
        return new StandardAddress(this.stringBuilder.toString());
    }
}

View File

@ -0,0 +1,65 @@
package com.ruoyi.project.tool.address;
import java.util.AbstractList;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
/**
 * A recognised segment of an address: its type, its text, its position in the
 * source text and, optionally, child segments.
 *
 * @author lihe
 */
public class AddressNode {

    private AddressNodeType nodeType;

    /** Start position; {@code null} until set. */
    private Integer startIndex;

    /** End position; {@code null} until set. */
    private Integer endIndex;

    private String content;

    /** Child nodes in insertion order. */
    private final List<AddressNode> contentList = new LinkedList<>();

    /**
     * Creates a node without content.
     *
     * @param addressNodeType type of the node
     */
    public AddressNode(AddressNodeType addressNodeType) {
        this.nodeType = addressNodeType;
    }

    /**
     * Creates a node with content.
     *
     * @param addressNodeType type of the node
     * @param content text of the node
     */
    public AddressNode(AddressNodeType addressNodeType, String content) {
        this.nodeType = addressNodeType;
        this.content = content;
    }

    public AddressNodeType getNodeType() {
        return nodeType;
    }

    public void setNodeType(AddressNodeType nodeType) {
        this.nodeType = nodeType;
    }

    public Integer getStartIndex() {
        return startIndex;
    }

    public void setStartIndex(Integer startIndex) {
        this.startIndex = startIndex;
    }

    public Integer getEndIndex() {
        return endIndex;
    }

    public void setEndIndex(Integer endIndex) {
        this.endIndex = endIndex;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    /**
     * Appends a child node.
     *
     * @param addressNode child to add
     */
    public void addChildNode(AddressNode addressNode) {
        contentList.add(addressNode);
    }

    /**
     * Returns the children added through {@link #addChildNode(AddressNode)}.
     *
     * @return a new list holding the children in insertion order; changes to the
     *         returned list do not affect this node
     */
    public List<AddressNode> getChildNodes() {
        return new LinkedList<>(contentList);
    }
}

View File

@ -0,0 +1,32 @@
package com.ruoyi.project.tool.address;
/**
 * Kinds of address node, each carrying a display name.
 *
 * NOTE(review): several constants carry an empty display name; confirm that this
 * is intended and not text lost from the file.
 *
 * @author lihe
 */
public enum AddressNodeType {
    DISTRICT("区域"),
    REGION(""),
    BLOCK("板块"),
    NONG(""),
    ROAD(""),
    HAO(""),
    MULTI_HAO("多号"),
    ZHUANG(""),
    ZUO(""),
    CENG(""),
    // Underground floor / ground-level shop
    XIXIA_CENG("地下层"),
    SHI("");

    /** Display name supplied in the constant declaration. */
    private final String name;

    AddressNodeType(String displayName) {
        name = displayName;
    }

    /**
     * @return the display name of this constant (may be empty)
     */
    public String getName() {
        return this.name;
    }
}

View File

@ -0,0 +1,29 @@
package com.ruoyi.project.tool.address;
import com.ruoyi.project.tool.address.model.AddressType;
import java.util.LinkedList;
import java.util.List;
/**
 * Result of cleaning one address: the raw text together with the
 * {@link AddressContent} entries collected for it.
 *
 * @author lihe
 */
public class StandardAddress {

    /** Text passed to the constructor; stored but not read inside this class. */
    private final String rawAddress;

    /** Collected entries in insertion order. */
    private final List<AddressContent> contents = new LinkedList<>();

    /**
     * @param text raw address text
     */
    public StandardAddress(String text) {
        rawAddress = text;
    }

    /**
     * Appends one entry.
     *
     * @param addressContent entry to add
     */
    public void addAddressContent(AddressContent addressContent) {
        this.contents.add(addressContent);
    }

    /**
     * @return the live list of entries added so far (not a copy)
     */
    public List<AddressContent> getResult() {
        return this.contents;
    }
}

View File

@ -141,7 +141,6 @@ public class CleanAddressBuilder {
cleanAddress.setIndependent(Boolean.TRUE);
}
return this;
}

View File

@ -1,18 +1,82 @@
package com.ruoyi.project.tool.address.service;
import com.sun.xml.internal.fastinfoset.algorithm.BooleanEncodingAlgorithm;
import java.util.List;
/**
 * Address builder (builder pattern): one parse step per address segment.
 *
 * <p>Whether a step returns {@code null} or an empty list when nothing is found
 * is decided by the implementation.
 *
 * @author lihe
 */
public interface AddressBuilder {

    /**
     * Parses the district.
     *
     * @return the district
     */
    String parseDistrict();

    /**
     * Parses the region.
     *
     * @return the region
     */
    String parseRegion();

    /**
     * Parses the block.
     *
     * @return the block
     */
    String parseBlock();

    /**
     * Parses the road names.
     *
     * @return the road names found
     */
    List<String> parseRoad();

    /**
     * Parses the residential community name.
     *
     * @return the community name
     */
    String parseCommunityName();

    /**
     * Parses the "nong" segment (presumably the lane, 弄 — confirm).
     *
     * @return the segment
     */
    String parseNONG();

    /**
     * Parses the "hao" segments (presumably house numbers, 号 — confirm).
     *
     * @return the segments found
     */
    List<String> parseHao();

    /**
     * Parses the "shi" segment (presumably the room, 室 — confirm).
     *
     * @return the segment
     */
    String parseShi();

    /**
     * Parses the floor (including underground floors, per the original note).
     *
     * @return the floor
     */
    String parseFloor();

    /**
     * Tells whether the address is a standalone building.
     *
     * @return whether the address is standalone
     */
    Boolean parseIndependent();
}

View File

@ -0,0 +1,52 @@
package com.ruoyi.project.tool.address.service;
import com.ruoyi.project.tool.address.model.CleanAddress;
import java.util.LinkedList;
import java.util.List;
/**
 * Address cleaning entry point (work in progress: {@link #clear(String)} does not
 * produce a result yet).
 *
 * @author lihe
 */
public class AddressCleanUtils {

    /** Builder created by the most recent {@link #clear(String)} call, if any. */
    private AddressBuilder builder;

    /**
     * Separator strings looked for in the address text.
     * NOTE(review): several entries are empty strings, and String.contains("") is
     * true for every input; confirm these entries against the repository copy.
     */
    private static final List<String> specialChar = new LinkedList<>();

    static {
        specialChar.add("");
        specialChar.add(".");
        specialChar.add("");
        specialChar.add(",");
        specialChar.add("-");
        specialChar.add("——");
        specialChar.add("_");
        specialChar.add("");
        specialChar.add("");
        specialChar.add("");
        specialChar.add("(");
        specialChar.add(")");
        specialChar.add("");
        specialChar.add("");
    }

    /**
     * Normalises the text (trims it, removes tabs and spaces) and, when it contains
     * any entry of {@code specialChar}, creates a {@link DefaultAddressBuilder} for it.
     *
     * @param text raw address text; must not be {@code null}
     * @return currently always {@code null}; result assembly is not implemented yet
     */
    public List<CleanAddress> clear(String text) {
        String todoAddress = text.trim()
                .replace("\t", "")
                .replace(" ", "");
        for (String separator : specialChar) {
            if (todoAddress.contains(separator)) {
                builder = new DefaultAddressBuilder(todoAddress);
                // One builder is enough: every further match would build an
                // identical one from the same text.
                break;
            }
        }
        return null;
    }
}

View File

@ -1,14 +1,191 @@
package com.ruoyi.project.tool.address.service;
import com.ruoyi.common.utils.LoadUtil;
import com.ruoyi.project.tool.address.model.AddressType;
import com.ruoyi.project.tool.address.model.CleanAddress;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Base address builder: extracts address segments from the text held by a
 * {@link CleanContext}, using the dictionaries loaded through {@code LoadUtil}
 * and the regular expressions declared below.
 *
 * @author lihe
 */
public abstract class BaseAddressBuilder implements AddressBuilder {

    /** Copy of the constructor text; not read inside this class. */
    protected StringBuilder stringBuilder;

    private final Map<String, String> districtMap = LoadUtil.loadDict("address-dict/district.dict");
    private final List<String> regionList = LoadUtil.loadList("address-dict/region.dict");
    private final List<String> blockList = LoadUtil.loadList("address-dict/block.dict");
    private final List<String> roadList = LoadUtil.loadList("address-dict/road.dict");

    private static final int DISTRICT_LENGTH = 8;
    private static final String SHANGHAI_SHI = "上海市";
    private static final String SHANGHAI = "上海";
    // NOTE(review): SHANG and HAO are taken to be the single characters 上 and 号
    // (cf. SHANG_SHANG = "上上" and DEFAULT_HAO_PATTERN); confirm against the
    // repository copy of this file.
    private static final String SHANG = "上";
    private static final String HAO = "号";
    private static final String SHANG_SHANG = "上上";
    // NOTE(review): suffix stripped from road names in the fallback pass of
    // parseRoad(); taken to be 路 (road) — confirm.
    private static final String ROAD_SUFFIX = "路";

    private static final String CHINESE_FLOOR_PATTERN = "([一二三四五六七八九十]+)层";
    private static final String NUMBER_FLOOR_PATTERN = "(\\d+)层";
    private static final String DEFAULT_HAO_PATTERN = "([\\dA-Za-z]+)号";
    private static final String DEFAULT_SHI_PATTERN = "([\\dA-Za-z]+)(室?)$";
    private static final String DEFAULT_NONG_PATTERN = "[\\d一二三四五六七八九十]+弄([\\d一二三四五六七八九十]+支弄)?";

    // Compiled once: Pattern instances are immutable and safe to share.
    private static final Pattern CHINESE_FLOOR = Pattern.compile(CHINESE_FLOOR_PATTERN);
    private static final Pattern NUMBER_FLOOR = Pattern.compile(NUMBER_FLOOR_PATTERN);
    private static final Pattern HAO_REGEX = Pattern.compile(DEFAULT_HAO_PATTERN);
    private static final Pattern SHI_REGEX = Pattern.compile(DEFAULT_SHI_PATTERN);
    private static final Pattern NONG_REGEX = Pattern.compile(DEFAULT_NONG_PATTERN);

    protected CleanContext context;

    /**
     * @param text address text to parse
     */
    public BaseAddressBuilder(String text) {
        this.stringBuilder = new StringBuilder(text);
        this.context = new CleanContext(text);
    }

    /**
     * Address category (per the original note: single unit, building, community).
     *
     * @return the category handled by this builder
     */
    public abstract AddressType getAddressType();

    /**
     * Matches the head of the address (at most {@code DISTRICT_LENGTH - 1} characters)
     * against the keys of the district dictionary.
     *
     * @return the dictionary value of the first key the head starts with, or
     *         {@code null} when no key matches
     */
    @Override
    public String parseDistrict() {
        StringBuilder address = this.context.getBoundAddress();
        String head = address.length() >= DISTRICT_LENGTH
                ? address.substring(0, DISTRICT_LENGTH - 1)
                : address.toString();
        // Prefix the head with SHANG and collapse a doubled SHANG.
        String shanghaiAndDistrict = (SHANG + head).replace(SHANG_SHANG, SHANG);
        if (!shanghaiAndDistrict.startsWith(SHANGHAI_SHI)
                && !shanghaiAndDistrict.startsWith(SHANGHAI)
                && shanghaiAndDistrict.startsWith(SHANG)) {
            // Not a city-prefixed address: drop only the leading character.
            // (substring(1, length - 1) would also cut the last character, so a
            // short address consisting of just the district name could not match.)
            shanghaiAndDistrict = shanghaiAndDistrict.substring(1);
        }
        for (Map.Entry<String, String> district : districtMap.entrySet()) {
            if (shanghaiAndDistrict.startsWith(district.getKey())) {
                return district.getValue();
            }
        }
        return null;
    }

    /**
     * @return the first region dictionary entry contained in the address, or
     *         {@code null} when none is
     */
    @Override
    public String parseRegion() {
        StringBuilder address = this.context.getBoundAddress();
        for (String region : regionList) {
            if (-1 != address.indexOf(region)) {
                return region;
            }
        }
        return null;
    }

    /**
     * @return the first block dictionary entry contained in the address, or
     *         {@code null} when none is
     */
    @Override
    public String parseBlock() {
        StringBuilder address = this.context.getBoundAddress();
        for (String block : blockList) {
            if (-1 != address.indexOf(block)) {
                // Return the matched block itself, not the region at the same index.
                return block;
            }
        }
        return null;
    }

    /**
     * Collects every road dictionary entry contained in the address. When none is
     * found, retries with the road suffix stripped from each entry.
     *
     * @return the matched road names (suffix-stripped in the fallback case); empty
     *         when nothing matches, never {@code null}
     */
    @Override
    public List<String> parseRoad() {
        StringBuilder address = this.context.getBoundAddress();
        List<String> roadSegmentList = new LinkedList<>();
        for (String road : roadList) {
            if (-1 != address.indexOf(road)) {
                roadSegmentList.add(road);
            }
        }
        if (!roadSegmentList.isEmpty()) {
            return roadSegmentList;
        }
        for (String road : roadList) {
            String noRoad = road.endsWith(ROAD_SUFFIX)
                    ? road.substring(0, road.length() - 1)
                    : road;
            // An empty remainder would be "contained" in every address; skip it.
            if (!noRoad.isEmpty() && -1 != address.indexOf(noRoad)) {
                roadSegmentList.add(noRoad);
            }
        }
        return roadSegmentList;
    }

    /**
     * Not implemented in this base class.
     *
     * @return always {@code null}
     */
    @Override
    public String parseCommunityName() {
        return null;
    }

    /**
     * @return the first match of {@code DEFAULT_NONG_PATTERN} in the address, or
     *         {@code null} when there is none
     */
    @Override
    public String parseNONG() {
        Matcher matcher = NONG_REGEX.matcher(this.context.getBoundAddress());
        return matcher.find() ? matcher.group(0) : null;
    }

    /**
     * @return every match of {@code DEFAULT_HAO_PATTERN} in the address, in order of
     *         appearance, or {@code null} when there is none
     */
    @Override
    public List<String> parseHao() {
        List<String> haoSegmentList = new LinkedList<>();
        Matcher matcher = HAO_REGEX.matcher(this.context.getBoundAddress());
        while (matcher.find()) {
            haoSegmentList.add(matcher.group());
        }
        // null (not an empty list) is kept for callers that test for null.
        return haoSegmentList.isEmpty() ? null : haoSegmentList;
    }

    /**
     * @return group 1 of the first {@code DEFAULT_SHI_PATTERN} match (the part before
     *         the optional 室), or {@code null} when there is none
     */
    @Override
    public String parseShi() {
        Matcher matcher = SHI_REGEX.matcher(this.context.getBoundAddress());
        return matcher.find() ? matcher.group(1) : null;
    }

    /**
     * Looks for a floor written with Chinese numerals first, then with digits.
     *
     * @return the numeral part of the first match, or {@code null} when neither
     *         pattern matches
     */
    @Override
    public String parseFloor() {
        Matcher matcher = CHINESE_FLOOR.matcher(this.context.getBoundAddress());
        if (matcher.find()) {
            return matcher.group(1);
        }
        matcher = NUMBER_FLOOR.matcher(this.context.getBoundAddress());
        if (matcher.find()) {
            return matcher.group(1);
        }
        return null;
    }

    /**
     * @return {@code false} when {@code DEFAULT_SHI_PATTERN} matches the address,
     *         {@code true} otherwise
     */
    @Override
    public Boolean parseIndependent() {
        return !SHI_REGEX.matcher(this.context.getBoundAddress()).find();
    }

    /**
     * @return the result list currently stored in the context
     */
    public List<CleanAddress> getResult() {
        return this.context.getResult();
    }
}

View File

@ -1,16 +1,30 @@
package com.ruoyi.project.tool.address.service;
import com.ruoyi.project.tool.address.model.CleanAddress;
import java.util.List;
/**
* 地址清洗
*
* @author lihe
*/
public class CleanContext {
private StringBuilder sb;
private StringBuilder boundAddress;
private Boolean multiHao;
private String address;
private List<CleanAddress> result;
public StringBuilder getSb() {
return sb;
public CleanContext(String text) {
this.boundAddress = new StringBuilder(text);
}
public void setSb(StringBuilder sb) {
this.sb = sb;
public StringBuilder getBoundAddress() {
return boundAddress;
}
public void setBoundAddress(StringBuilder boundAddress) {
this.boundAddress = boundAddress;
}
public Boolean getMultiHao() {
@ -28,4 +42,12 @@ public class CleanContext {
/**
* Stores the given value in the {@code address} field.
*
* @param address value to store
*/
public void setAddress(String address) {
this.address = address;
}
/**
* Returns the {@code result} field as stored (no copy is made).
*
* @return the stored result list
*/
public List<CleanAddress> getResult() {
return result;
}
/**
* Stores the given list in the {@code result} field (the reference is kept, not copied).
*
* @param result list to store
*/
public void setResult(List<CleanAddress> result) {
this.result = result;
}
}

View File

@ -0,0 +1,22 @@
package com.ruoyi.project.tool.address.service;
import com.ruoyi.project.tool.address.model.AddressType;
import java.util.List;
/**
* Default concrete {@link BaseAddressBuilder}; reports {@link AddressType#CONDO}
* as its address type.
*
* @author lihe
*/
public class DefaultAddressBuilder extends BaseAddressBuilder {
/**
* @param text address text, passed unchanged to the base-class constructor
*/
public DefaultAddressBuilder(String text) {
super(text);
}
/**
* @return always {@link AddressType#CONDO}
*/
@Override
public AddressType getAddressType() {
return AddressType.CONDO;
}
}

View File

@ -0,0 +1,251 @@
package com.ruoyi.project.tool.address.utils;
import com.ruoyi.common.utils.LoadUtil;
import com.ruoyi.project.tool.address.AddressContext;
import com.ruoyi.project.tool.address.AddressNode;
import com.ruoyi.project.tool.address.AddressNodeType;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Address segment analysis: scans the text of an {@link AddressContext} for address
 * segments and records each hit on that context as an {@link AddressNode}.
 *
 * @author lihe
 */
public class AddressNodeParse {

    private final Map<String, String> districtMap = LoadUtil.loadDict("address-dict/district.dict");
    private final List<String> regionList = LoadUtil.loadList("address-dict/region.dict");
    private final List<String> blockList = LoadUtil.loadList("address-dict/block.dict");
    private final List<String> roadList = LoadUtil.loadList("address-dict/road.dict");

    private static final int DISTRICT_LENGTH = 8;
    private static final String SHANGHAI_SHI = "上海市";
    private static final String SHANGHAI = "上海";
    // NOTE(review): SHANG and HAO are taken to be the single characters 上 and 号
    // (cf. SHANG_SHANG = "上上" and DEFAULT_HAO_PATTERN); confirm against the
    // repository copy of this file.
    private static final String SHANG = "上";
    private static final String HAO = "号";
    private static final String SHANG_SHANG = "上上";
    // NOTE(review): suffix stripped from road names in the fallback pass of
    // parseRoad(); taken to be 路 (road) — confirm.
    private static final String ROAD_SUFFIX = "路";

    private static final String DEFAULT_FLOOR_PATTERN = "([\\d一二三四五六七八九十]+)层";
    private static final String NUMBER_FLOOR_PATTERN = "(\\d+)层";
    private static final String DEFAULT_HAO_PATTERN = "([\\dA-Za-z]+)号";
    private static final String DEFAULT_ZHUANG_PATTERN = "([\\dA-Za-z]+)幢";
    private static final String DEFAULT_CENG_PATTERN = "([\\d])层";
    private static final String DEFAULT_SHI_PATTERN = "([\\dA-Za-z]+)(室?)$";
    private static final String DEFAULT_NONG_PATTERN = "[\\d一二三四五六七八九十]+弄([\\d一二三四五六七八九十]+支弄)?";

    // Compiled once: Pattern instances are immutable and safe to share.
    private static final Pattern FLOOR_REGEX = Pattern.compile(DEFAULT_FLOOR_PATTERN);
    private static final Pattern HAO_REGEX = Pattern.compile(DEFAULT_HAO_PATTERN);
    private static final Pattern ZHUANG_REGEX = Pattern.compile(DEFAULT_ZHUANG_PATTERN);
    private static final Pattern SHI_REGEX = Pattern.compile(DEFAULT_SHI_PATTERN);
    private static final Pattern NONG_REGEX = Pattern.compile(DEFAULT_NONG_PATTERN);

    /**
     * Separator strings; not read by any method of this class.
     * NOTE(review): several entries are empty strings; confirm against the
     * repository copy.
     */
    private static List<String> specialChar = new LinkedList<>();

    private final AddressContext context;

    static {
        specialChar.add("");
        specialChar.add(".");
        specialChar.add("");
        specialChar.add(",");
        specialChar.add("-");
        specialChar.add("——");
        specialChar.add("_");
        specialChar.add("");
        specialChar.add("");
        specialChar.add("");
        specialChar.add("(");
        specialChar.add(")");
        specialChar.add("");
        specialChar.add("");
    }

    /**
     * @param addressContext context whose text is scanned and which receives the nodes
     */
    public AddressNodeParse(AddressContext addressContext) {
        this.context = addressContext;
    }

    /**
     * District: matches the head of the text (at most {@code DISTRICT_LENGTH - 1}
     * characters) against the keys of the district dictionary and adds one DISTRICT
     * node per key found, carrying the dictionary value as content.
     */
    public void parseDistrict() {
        StringBuilder content = this.context.getContent();
        String head = content.length() >= DISTRICT_LENGTH
                ? content.substring(0, DISTRICT_LENGTH - 1)
                : content.toString();
        // Prefix the head with SHANG and collapse a doubled SHANG.
        String shanghaiAndDistrict = (SHANG + head).replace(SHANG_SHANG, SHANG);
        if (!shanghaiAndDistrict.startsWith(SHANGHAI_SHI)
                && !shanghaiAndDistrict.startsWith(SHANGHAI)
                && shanghaiAndDistrict.startsWith(SHANG)) {
            // Drop only the leading character; substring(1, length - 1) would also
            // cut the last character of the head.
            shanghaiAndDistrict = shanghaiAndDistrict.substring(1);
        }
        for (Map.Entry<String, String> district : districtMap.entrySet()) {
            String key = district.getKey();
            // NOTE(review): this index is relative to the normalised head, which can
            // be shifted by one character against the context text.
            int index = shanghaiAndDistrict.indexOf(key);
            if (-1 != index) {
                this.context.addAddressNode(
                        newNode(AddressNodeType.DISTRICT, district.getValue(), index, key.length()));
            }
        }
    }

    /**
     * Region: adds one REGION node for every region dictionary entry contained in the text.
     */
    public void parseRegion() {
        addDictionaryNodes(regionList, AddressNodeType.REGION);
    }

    /**
     * Block: adds one BLOCK node for every block dictionary entry contained in the text.
     */
    public void parseBlock() {
        addDictionaryNodes(blockList, AddressNodeType.BLOCK);
    }

    /**
     * Road: builds one parent ROAD node with a child per road dictionary entry
     * contained in the text. When no entry matches, retries with the road suffix
     * stripped from each entry. The parent is added to the context only when at
     * least one child was found; its span covers all children.
     */
    public void parseRoad() {
        StringBuilder content = this.context.getContent();
        AddressNode roadNode = new AddressNode(AddressNodeType.ROAD);
        for (String road : roadList) {
            attachRoad(roadNode, road, content.indexOf(road));
        }
        if (null == roadNode.getStartIndex()) {
            for (String road : roadList) {
                String noRoad = road.endsWith(ROAD_SUFFIX)
                        ? road.substring(0, road.length() - 1)
                        : road;
                // An empty remainder would be "found" at index 0 of every text; skip it.
                if (!noRoad.isEmpty()) {
                    attachRoad(roadNode, noRoad, content.indexOf(noRoad));
                }
            }
        }
        if (null != roadNode.getStartIndex()) {
            this.context.addAddressNode(roadNode);
        }
    }

    /**
     * Adds a NONG node for the first match of {@code DEFAULT_NONG_PATTERN}, if any.
     */
    public void parseNONG() {
        addFirstMatch(NONG_REGEX, AddressNodeType.NONG);
    }

    /**
     * Adds a HAO node for the first match of {@code DEFAULT_HAO_PATTERN}, if any.
     */
    public void parseHao() {
        addFirstMatch(HAO_REGEX, AddressNodeType.HAO);
    }

    /**
     * Adds a ZHUANG node for the first match of {@code DEFAULT_ZHUANG_PATTERN}, if any.
     */
    public void parseZhuang() {
        addFirstMatch(ZHUANG_REGEX, AddressNodeType.ZHUANG);
    }

    /**
     * Adds a CENG node for the first match of {@code DEFAULT_FLOOR_PATTERN}, if any.
     */
    public void parseCeng() {
        addFirstMatch(FLOOR_REGEX, AddressNodeType.CENG);
    }

    /**
     * Adds a SHI node for the first match of {@code DEFAULT_SHI_PATTERN}, if any.
     */
    public void parseShi() {
        addFirstMatch(SHI_REGEX, AddressNodeType.SHI);
    }

    /** Adds one node of the given type for every dictionary entry contained in the text. */
    private void addDictionaryNodes(List<String> dictionary, AddressNodeType type) {
        StringBuilder content = this.context.getContent();
        for (String entry : dictionary) {
            int index = content.indexOf(entry);
            if (-1 != index) {
                this.context.addAddressNode(newNode(type, entry, index, entry.length()));
            }
        }
    }

    /** Adds a node of the given type for the first regex match in the text, if there is one. */
    private void addFirstMatch(Pattern pattern, AddressNodeType type) {
        Matcher matcher = pattern.matcher(this.context.getContent());
        if (matcher.find()) {
            String matched = matcher.group();
            // matcher.start() is the position of this match; indexOf(matched) could
            // point at an earlier, unrelated occurrence of the same text.
            this.context.addAddressNode(newNode(type, matched, matcher.start(), matched.length()));
        }
    }

    /** Attaches a child ROAD node to the parent and widens the parent's span to cover it. */
    private void attachRoad(AddressNode parent, String name, int index) {
        if (-1 == index) {
            return;
        }
        AddressNode child = newNode(AddressNodeType.ROAD, name, index, name.length());
        parent.addChildNode(child);
        if (null == parent.getStartIndex() || index < parent.getStartIndex()) {
            parent.setStartIndex(index);
        }
        if (null == parent.getEndIndex() || child.getEndIndex() > parent.getEndIndex()) {
            parent.setEndIndex(child.getEndIndex());
        }
    }

    /**
     * Creates a positioned node.
     * NOTE(review): the end index is start + length + 1, i.e. one past the exclusive
     * end of the match. Kept as in the original code because the convention expected
     * by consumers of the index is not visible here — confirm.
     */
    private static AddressNode newNode(AddressNodeType type, String content, int start, int matchedLength) {
        AddressNode node = new AddressNode(type, content);
        node.setStartIndex(start);
        node.setEndIndex(start + matchedLength + 1);
        return node;
    }
}

View File

@ -0,0 +1,36 @@
package com.ruoyi.project.tool.address.utils;
import com.ruoyi.project.tool.address.AddressBuilder;
import com.ruoyi.project.tool.address.AddressContext;
import com.ruoyi.project.tool.address.AddressNode;
import com.ruoyi.project.tool.address.StandardAddress;
import java.util.List;
/**
 * Default address builder: runs the {@link AddressNodeParse} steps over the text
 * and returns the result of the {@link AddressContext}.
 *
 * @author lihe
 */
public class DefaultAddressBuilder implements AddressBuilder {

    /**
     * Parses the given text step by step.
     *
     * <p>All state lives in local variables, so one instance can serve several
     * calls without one call overwriting the state of another.
     *
     * @param text raw address text
     * @return the result produced by the context after all parse steps have run
     */
    @Override
    public StandardAddress clear(String text) {
        AddressContext addressContext = new AddressContext(text);
        AddressNodeParse addressNodeParse = new AddressNodeParse(addressContext);
        // Locate the district first, then the finer-grained segments.
        addressNodeParse.parseDistrict();
        addressNodeParse.parseRegion();
        addressNodeParse.parseBlock();
        addressNodeParse.parseRoad();
        addressNodeParse.parseNONG();
        addressNodeParse.parseHao();
        // NOTE(review): AddressNodeParse also offers parseZhuang(), which is not
        // invoked here; confirm whether that is intended.
        addressNodeParse.parseCeng();
        addressNodeParse.parseShi();
        return addressContext.getResult();
    }
}