一些中文相关的操作方法

package com.opslab.util;


import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
* 一些中文相关的操作方法
*/
public final class ChinesUtil {
private ChinesUtil(){

}
/**
* 将字符串中的中文转化为拼音,其他字符不变
*
* @param inputString
* @return
*/
public final static String getPingYin(String inputString) {
HanyuPinyinOutputFormat format = new HanyuPinyinOutputFormat();
format.setCaseType(HanyuPinyinCaseType.LOWERCASE);
format.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
format.setVCharType(HanyuPinyinVCharType.WITH_V);

char[] input = inputString.trim().toCharArray();
String output = "";

try {
for (int i = 0; i < input.length; i++) {
if (java.lang.Character.toString(input[i]).matches("[\u4E00-\u9FA5]+")) {
String[] temp = PinyinHelper.toHanyuPinyinStringArray(input[i], format);
output += temp[0];
} else
output += java.lang.Character.toString(input[i]);
}
} catch (BadHanyuPinyinOutputFormatCombination e) {
e.printStackTrace();
}
return output;
}

/**
* 获取汉字串拼音首字母,英文字符不变
*
* @param chinese 汉字串
* @return 汉语拼音首字母
*/
public final static String getFirstSpell(String chinese) {
StringBuffer pybf = new StringBuffer();
char[] arr = chinese.toCharArray();
HanyuPinyinOutputFormat defaultFormat = new HanyuPinyinOutputFormat();
defaultFormat.setCaseType(HanyuPinyinCaseType.LOWERCASE);
defaultFormat.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
for (int i = 0; i < arr.length; i++) {
if (arr[i] > 128) {
try {
String[] temp = PinyinHelper.toHanyuPinyinStringArray(arr[i], defaultFormat);
if (temp != null) {
pybf.append(temp[0].charAt(0));
}
} catch (BadHanyuPinyinOutputFormatCombination e) {
e.printStackTrace();
}
} else {
pybf.append(arr[i]);
}
}
return pybf.toString().replaceAll("\W", "").trim();
}

/**
* 获取汉字串拼音,英文字符不变
*
* @param chinese 汉字串
* @return 汉语拼音
*/
public final static String getFullSpell(String chinese) {
StringBuffer pybf = new StringBuffer();
char[] arr = chinese.toCharArray();
HanyuPinyinOutputFormat defaultFormat = new HanyuPinyinOutputFormat();
defaultFormat.setCaseType(HanyuPinyinCaseType.LOWERCASE);
defaultFormat.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
for (int i = 0; i < arr.length; i++) {
if (arr[i] > 128) {
try {
pybf.append(PinyinHelper.toHanyuPinyinStringArray(arr[i], defaultFormat)[0]);
} catch (BadHanyuPinyinOutputFormatCombination e) {
e.printStackTrace();
}
} else {
pybf.append(arr[i]);
}
}
return pybf.toString();
}


// 只能判断部分CJK字符(CJK统一汉字)
public final static boolean isChineseByREG(String str) {
if (str == null) {
return false;
}
Pattern pattern = Pattern.compile("[\u4E00-\u9FBF]+");
return pattern.matcher(str.trim()).find();
}

// 只能判断部分CJK字符(CJK统一汉字)
public final static boolean isChineseByName(String str) {
if (str == null) {
return false;
}
// 大小写不同:\p 表示包含,\P 表示不包含
// \p{Cn} 的意思为 Unicode 中未被定义字符的编码,\P{Cn} 就表示 Unicode中已经被定义字符的编码
String reg = "\p{InCJK Unified Ideographs}&&\P{Cn}";
Pattern pattern = Pattern.compile(reg);
return pattern.matcher(str.trim()).find();
}


// 完整的判断中文汉字和符号
public final static boolean isChinese(String strName) {
char[] ch = strName.toCharArray();
for (int i = 0; i < ch.length; i++) {
char c = ch[i];
if (isChinese(c)) {
return true;
}
}
return false;
}

/**
* 判断是否是中文
*
* @param c
* @return
*/
public final static boolean isChinese(char c) {
Character.UnicodeBlock ub = Character.UnicodeBlock.of(c);
if (ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A || ub == Character.UnicodeBlock.GENERAL_PUNCTUATION || ub == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION || ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
return true;
}
return false;
}

/**
* 获取一个字符串中中文字符的个数
*/
public final static int ChineseLength(String str) {
Pattern p = Pattern.compile("[u4E00-u9FA5]+");
Matcher m = p.matcher(str);
int i = 0;
while (m.find()) {
String temp = m.group(0);
i += temp.length();
}
return i;
}

/**
* 判断是否是乱码
*
* @param strName
* @return
*/
public final static boolean isMessyCode(String strName) {
Pattern p = Pattern.compile("\s*| *| *| *");
Matcher m = p.matcher(strName);
String after = m.replaceAll("");
String temp = after.replaceAll("\p{P}", "");
char[] ch = temp.trim().toCharArray();
float chLength = 0;
float count = 0;
for (int i = 0; i < ch.length; i++) {
char c = ch[i];
if (!Character.isLetterOrDigit(c)) {
if (!ChinesUtil.isChinese(c)) {
count = count + 1;
}
chLength++;
}
}
float result = count / chLength;
if (result > 0.4) {
return true;
} else {
return false;
}
}
}

原文地址:https://www.cnblogs.com/chinaifae/p/10254805.html