常用工具类

  1 import java.io.*;
  2 import java.util.Collection;
  3 import java.util.Iterator;
  4 import java.util.List;
  5 
  6 /**
  7  * 文本工具类
  8  */
  9 public class TextUtility
 10 {
 11 
 12     /**
 13      * Single-byte character
 14      */
 15     public static final int CT_SINGLE = 5;// SINGLE byte
 16 
 17     /**
 18      * Delimiter such as "!,.?()[]{}+=
 19      */
 20     public static final int CT_DELIMITER = CT_SINGLE + 1;// delimiter
 21 
 22     /**
 23      * Chinese character
 24      */
 25     public static final int CT_CHINESE = CT_SINGLE + 2;// Chinese Char
 26 
 27     /**
 28      * Letter
 29      */
 30     public static final int CT_LETTER = CT_SINGLE + 3;// letter
 31 
 32     /**
 33      * Digit
 34      */
 35     public static final int CT_NUM = CT_SINGLE + 4;// digit
 36 
 37     /**
 38      * Index (ordinal) symbol
 39      */
 40     public static final int CT_INDEX = CT_SINGLE + 5;// index symbol
 41 
 42     /**
 43      * Chinese numeral
 44      */
 45     public static final int CT_CNUM = CT_SINGLE + 6;
 46 
 47     /**
 48      * Other
 49      */
 50     public static final int CT_OTHER = CT_SINGLE + 12;// Other
 51 
 52     public static int charType(char c)
 53     {
 54         return charType(String.valueOf(c));
 55     }
 56 
 57     /**
 58      * 判断字符类型
 59      * @param str
 60      * @return
 61      */
 62     public static int charType(String str)
 63     {
 64         if (str != null && str.length() > 0)
 65         {
 66             if ("零○〇一二两三四五六七八九十廿百千万亿壹贰叁肆伍陆柒捌玖拾佰仟".contains(str)) return CT_CNUM;
 67             byte[] b;
 68             try
 69             {
 70                 b = str.getBytes("GBK");
 71             }
 72             catch (UnsupportedEncodingException e)
 73             {
 74                 b = str.getBytes();
 75                 e.printStackTrace();
 76             }
 77             byte b1 = b[0];
 78             byte b2 = b.length > 1 ? b[1] : 0;
 79             int ub1 = getUnsigned(b1);
 80             int ub2 = getUnsigned(b2);
 81             if (ub1 < 128)
 82             {
 83                 if (ub1 < 32) return CT_DELIMITER; // NON PRINTABLE CHARACTERS
 84                 if (' ' == b1) return CT_OTHER;
 85                 if ('\n' == b1) return CT_DELIMITER;
 86                 if ("*\"!,.?()[]{}+=/\\;:|".indexOf((char) b1) != -1)
 87                     return CT_DELIMITER;
 88                 if ("0123456789".indexOf((char)b1) != -1)
 89                     return CT_NUM;
 90                 return CT_SINGLE;
 91             }
 92             else if (ub1 == 162)
 93                 return CT_INDEX;
 94             else if (ub1 == 163 && ub2 > 175 && ub2 < 186)
 95                 return CT_NUM;
 96             else if (ub1 == 163
 97                     && (ub2 >= 193 && ub2 <= 218 || ub2 >= 225
 98                     && ub2 <= 250))
 99                 return CT_LETTER;
100             else if (ub1 == 161 || ub1 == 163)
101                 return CT_DELIMITER;
102             else if (ub1 >= 176 && ub1 <= 247)
103                 return CT_CHINESE;
104 
105         }
106         return CT_OTHER;
107     }
108 
109     /**
110      * 是否全是中文
111      * @param str
112      * @return
113      */
114     public static boolean isAllChinese(String str)
115     {
116         return str.matches("[\\u4E00-\\u9FA5]+");
117     }
118     /**
119      * 是否全部不是中文
120      * @param sString
121      * @return
122      */
123     public static boolean isAllNonChinese(byte[] sString)
124     {
125         int nLen = sString.length;
126         int i = 0;
127 
128         while (i < nLen)
129         {
130             if (getUnsigned(sString[i]) < 248 && getUnsigned(sString[i]) > 175)
131                 return false;
132             if (sString[i] < 0)
133                 i += 2;
134             else
135                 i += 1;
136         }
137         return true;
138     }
139 
140     /**
141      * 是否全是单字节
142      * @param str
143      * @return
144      */
145     public static boolean isAllSingleByte(String str)
146     {
147         assert str != null;
148         for (int i = 0; i < str.length(); i++)
149         {
150             if (str.charAt(i) >128)
151             {
152                 return false;
153             }
154         }
155         return true;
156     }
157 
158     /**
159      * 把表示数字含义的字符串转成整形
160      *
161      * @param str 要转换的字符串
162      * @return 如果是有意义的整数,则返回此整数值。否则,返回-1。
163      */
164     public static int cint(String str)
165     {
166         if (str != null)
167             try
168             {
169                 int i = new Integer(str).intValue();
170                 return i;
171             }
172             catch (NumberFormatException e)
173             {
174 
175             }
176 
177         return -1;
178     }
179     /**
180      * 是否全是数字
181      * @param str
182      * @return
183      */
184     public static boolean isAllNum(String str)
185     {
186         if (str == null)
187             return false;
188 
189         int i = 0;
190         /** 判断开头是否是+-之类的符号 */
191         if ("±+-+-—".indexOf(str.charAt(0)) != -1)
192             i++;
193         /** 如果是全角的0123456789 字符* */
194         while (i < str.length() && "0123456789".indexOf(str.charAt(i)) != -1)
195             i++;
196         // Get middle delimiter such as .
197         if (i > 0 && i < str.length())
198         {
199             char ch = str.charAt(i);
200             if ("·∶:,,..//".indexOf(ch) != -1)
201             {// 98.1%
202                 i++;
203                 while (i < str.length() && "0123456789".indexOf(str.charAt(i)) != -1)
204                     i++;
205             }
206         }
207         if (i >= str.length())
208             return true;
209 
210         /** 如果是半角的0123456789字符* */
211         while (i < str.length() && "0123456789".indexOf(str.charAt(i)) != -1)
212             i++;
213         // Get middle delimiter such as .
214         if (i > 0 && i < str.length())
215         {
216             char ch = str.charAt(i);
217             if (',' == ch || '.' == ch || '/' == ch  || ':' == ch || "∶·,./".indexOf(ch) != -1)
218             {// 98.1%
219                 i++;
220                 while (i < str.length() && "0123456789".indexOf(str.charAt(i)) != -1)
221                     i++;
222             }
223         }
224 
225         if (i < str.length())
226         {
227             if ("百千万亿佰仟%%‰".indexOf(str.charAt(i)) != -1)
228                 i++;
229         }
230         if (i >= str.length())
231             return true;
232 
233         return false;
234     }
235 
236     /**
237      * 是否全是序号
238      * @param sString
239      * @return
240      */
241     public static boolean isAllIndex(byte[] sString)
242     {
243         int nLen = sString.length;
244         int i = 0;
245 
246         while (i < nLen - 1 && getUnsigned(sString[i]) == 162)
247         {
248             i += 2;
249         }
250         if (i >= nLen)
251             return true;
252         while (i < nLen && (sString[i] > 'A' - 1 && sString[i] < 'Z' + 1)
253                 || (sString[i] > 'a' - 1 && sString[i] < 'z' + 1))
254         {// single
255             // byte
256             // number
257             // char
258             i += 1;
259         }
260 
261         if (i < nLen)
262             return false;
263         return true;
264 
265     }
266 
267     /**
268      * 是否全为英文
269      *
270      * @param text
271      * @return
272      */
273     public static boolean isAllLetter(String text)
274     {
275         for (int i = 0; i < text.length(); ++i)
276         {
277             char c = text.charAt(i);
278             if ((((c < 'a' || c > 'z')) && ((c < 'A' || c > 'Z'))))
279             {
280                 return false;
281             }
282         }
283 
284         return true;
285     }
286 
287     /**
288      * 是否全为英文或字母
289      *
290      * @param text
291      * @return
292      */
293     public static boolean isAllLetterOrNum(String text)
294     {
295         for (int i = 0; i < text.length(); ++i)
296         {
297             char c = text.charAt(i);
298             if ((((c < 'a' || c > 'z')) && ((c < 'A' || c > 'Z')) && ((c < '0' || c > '9'))))
299             {
300                 return false;
301             }
302         }
303 
304         return true;
305     }
306 
307     /**
308      * 是否全是分隔符
309      * @param sString
310      * @return
311      */
312     public static boolean isAllDelimiter(byte[] sString)
313     {
314         int nLen = sString.length;
315         int i = 0;
316 
317         while (i < nLen - 1 && (getUnsigned(sString[i]) == 161 || getUnsigned(sString[i]) == 163))
318         {
319             i += 2;
320         }
321         if (i < nLen)
322             return false;
323         return true;
324     }
325 
326     /**
327      * 是否全是中国数字
328      * @param word
329      * @return
330      */
331     public static boolean isAllChineseNum(String word)
332     {// 百分之五点六的人早上八点十八分起床
333 
334         String chineseNum = "零○一二两三四五六七八九十廿百千万亿壹贰叁肆伍陆柒捌玖拾佰仟∶·./点";//
335         String prefix = "几数上第";
336         String surfix = "几多余来成倍";
337         boolean round = false;
338 
339         if (word == null)
340             return false;
341 
342         char[] temp = word.toCharArray();
343         for (int i = 0; i < temp.length; i++)
344         {
345             if (word.startsWith("分之", i))// 百分之五
346             {
347                 i += 1;
348                 continue;
349             }
350             char tchar = temp[i];
351             if (i == 0 && prefix.indexOf(tchar) != -1)
352             {
353                 round = true;
354             }
355             else if (i == temp.length-1 && !round && surfix.indexOf(tchar) != -1)
356             {
357                 round = true;
358             }
359             else if (chineseNum.indexOf(tchar) == -1)
360                 return false;
361         }
362         return true;
363     }
364 
365 
366     /**
367      * 得到字符集的字符在字符串中出现的次数
368      *
369      * @param charSet
370      * @param word
371      * @return
372      */
373     public static int getCharCount(String charSet, String word)
374     {
375         int nCount = 0;
376 
377         if (word != null)
378         {
379             String temp = word + " ";
380             for (int i = 0; i < word.length(); i++)
381             {
382                 String s = temp.substring(i, i + 1);
383                 if (charSet.indexOf(s) != -1)
384                     nCount++;
385             }
386         }
387 
388         return nCount;
389     }
390 
391 
392     /**
393      * 获取字节对应的无符号整型数
394      *
395      * @param b
396      * @return
397      */
398     public static int getUnsigned(byte b)
399     {
400         if (b > 0)
401             return (int) b;
402         else
403             return (b & 0x7F + 128);
404     }
405 
406     /**
407      * 判断字符串是否是年份
408      *
409      * @param snum
410      * @return
411      */
412     public static boolean isYearTime(String snum)
413     {
414         if (snum != null)
415         {
416             int len = snum.length();
417             String first = snum.substring(0, 1);
418 
419             // 1992年, 98年,06年
420             if (isAllSingleByte(snum)
421                     && (len == 4 || len == 2 && (cint(first) > 4 || cint(first) == 0)))
422                 return true;
423             if (isAllNum(snum) && (len >= 3 || len == 2 && "056789".indexOf(first) != -1))
424                 return true;
425             if (getCharCount("零○一二三四五六七八九壹贰叁肆伍陆柒捌玖", snum) == len && len >= 2)
426                 return true;
427             if (len == 4 && getCharCount("千仟零○", snum) == 2)// 二仟零二年
428                 return true;
429             if (len == 1 && getCharCount("千仟", snum) == 1)
430                 return true;
431             if (len == 2 && getCharCount("甲乙丙丁戊己庚辛壬癸", snum) == 1
432                     && getCharCount("子丑寅卯辰巳午未申酉戌亥", snum.substring(1)) == 1)
433                 return true;
434         }
435         return false;
436     }
437 
438     /**
439      * 判断一个字符串的所有字符是否在另一个字符串集合中
440      *
441      * @param aggr 字符串集合
442      * @param str  需要判断的字符串
443      * @return
444      */
445     public static boolean isInAggregate(String aggr, String str)
446     {
447         if (aggr != null && str != null)
448         {
449             str += "1";
450             for (int i = 0; i < str.length(); i++)
451             {
452                 String s = str.substring(i, i + 1);
453                 if (aggr.indexOf(s) == -1)
454                     return false;
455             }
456             return true;
457         }
458 
459         return false;
460     }
461 
462     /**
463      * 判断该字符串是否是半角字符
464      *
465      * @param str
466      * @return
467      */
468     public static boolean isDBCCase(String str)
469     {
470         if (str != null)
471         {
472             str += " ";
473             for (int i = 0; i < str.length(); i++)
474             {
475                 String s = str.substring(i, i + 1);
476                 int length = 0;
477                 try
478                 {
479                     length = s.getBytes("GBK").length;
480                 }
481                 catch (UnsupportedEncodingException e)
482                 {
483                     e.printStackTrace();
484                     length = s.getBytes().length;
485                 }
486                 if (length != 1)
487                     return false;
488             }
489 
490             return true;
491         }
492 
493         return false;
494     }
495 
496     /**
497      * 判断该字符串是否是全角字符
498      *
499      * @param str
500      * @return
501      */
502     public static boolean isSBCCase(String str)
503     {
504         if (str != null)
505         {
506             str += " ";
507             for (int i = 0; i < str.length(); i++)
508             {
509                 String s = str.substring(i, i + 1);
510                 int length = 0;
511                 try
512                 {
513                     length = s.getBytes("GBK").length;
514                 }
515                 catch (UnsupportedEncodingException e)
516                 {
517                     e.printStackTrace();
518                     length = s.getBytes().length;
519                 }
520                 if (length != 2)
521                     return false;
522             }
523 
524             return true;
525         }
526 
527         return false;
528     }
529 
530     /**
531      * 判断是否是一个连字符(分隔符)
532      *
533      * @param str
534      * @return
535      */
536     public static boolean isDelimiter(String str)
537     {
538         if (str != null && ("-".equals(str) || "-".equals(str)))
539             return true;
540         else
541             return false;
542     }
543 
544     public static boolean isUnknownWord(String word)
545     {
546         if (word != null && word.indexOf("未##") == 0)
547             return true;
548         else
549             return false;
550     }
551 
552     /**
553      * 防止频率为0发生除零错误
554      *
555      * @param frequency
556      * @return
557      */
558     public static double nonZero(double frequency)
559     {
560         if (frequency == 0) return 1e-3;
561 
562         return frequency;
563     }
564 
565     /**
566      * 转换long型为char数组
567      *
568      * @param x
569      */
570     public static char[] long2char(long x)
571     {
572         char[] c = new char[4];
573         c[0] = (char) (x >> 48);
574         c[1] = (char) (x >> 32);
575         c[2] = (char) (x >> 16);
576         c[3] = (char) (x);
577         return c;
578     }
579 
580     /**
581      * 转换long类型为string
582      *
583      * @param x
584      * @return
585      */
586     public static String long2String(long x)
587     {
588         char[] cArray = long2char(x);
589         StringBuilder sbResult = new StringBuilder(cArray.length);
590         for (char c : cArray)
591         {
592             sbResult.append(c);
593         }
594         return sbResult.toString();
595     }
596 
597     /**
598      * 将异常转为字符串
599      *
600      * @param e
601      * @return
602      */
603     public static String exceptionToString(Exception e)
604     {
605         StringWriter sw = new StringWriter();
606         PrintWriter pw = new PrintWriter(sw);
607         e.printStackTrace(pw);
608         return sw.toString();
609     }
610 
611     /**
612      * 判断某个字符是否为汉字
613      *
614      * @param c 需要判断的字符
615      * @return 是汉字返回true,否则返回false
616      */
617     public static boolean isChinese(char c)
618     {
619         String regex = "[\\u4e00-\\u9fa5]";
620         return String.valueOf(c).matches(regex);
621     }
622 
623     /**
624      * 统计 keyword 在 srcText 中的出现次数
625      *
626      * @param keyword
627      * @param srcText
628      * @return
629      */
630     public static int count(String keyword, String srcText)
631     {
632         int count = 0;
633         int leng = srcText.length();
634         int j = 0;
635         for (int i = 0; i < leng; i++)
636         {
637             if (srcText.charAt(i) == keyword.charAt(j))
638             {
639                 j++;
640                 if (j == keyword.length())
641                 {
642                     count++;
643                     j = 0;
644                 }
645             }
646             else
647             {
648                 i = i - j;// should rollback when not match
649                 j = 0;
650             }
651         }
652 
653         return count;
654     }
655 
656     /**
657      * 简单好用的写String方式
658      *
659      * @param s
660      * @param out
661      * @throws IOException
662      */
663     public static void writeString(String s, DataOutputStream out) throws IOException
664     {
665         out.writeInt(s.length());
666         for (char c : s.toCharArray())
667         {
668             out.writeChar(c);
669         }
670     }
671 
672     /**
673      * 判断字符串是否为空(null和空格)
674      *
675      * @param cs
676      * @return
677      */
678     public static boolean isBlank(CharSequence cs)
679     {
680         int strLen;
681         if (cs == null || (strLen = cs.length()) == 0)
682         {
683             return true;
684         }
685         for (int i = 0; i < strLen; i++)
686         {
687             if (!Character.isWhitespace(cs.charAt(i)))
688             {
689                 return false;
690             }
691         }
692         return true;
693     }
694 
695     public static String join(String delimiter, Collection<String> stringCollection)
696     {
697         StringBuilder sb = new StringBuilder(stringCollection.size() * (16 + delimiter.length()));
698         for (String str : stringCollection)
699         {
700             sb.append(str).append(delimiter);
701         }
702 
703         return sb.toString();
704     }
705 
706     public static String combine(String... termArray)
707     {
708         StringBuilder sbSentence = new StringBuilder();
709         for (String word : termArray)
710         {
711             sbSentence.append(word);
712         }
713         return sbSentence.toString();
714     }
715 
716     public static String join(Iterable<? extends CharSequence> s, String delimiter)
717     {
718         Iterator<? extends CharSequence> iter = s.iterator();
719         if (!iter.hasNext()) return "";
720         StringBuilder buffer = new StringBuilder(iter.next());
721         while (iter.hasNext()) buffer.append(delimiter).append(iter.next());
722         return buffer.toString();
723     }
724 
725     public static String combine(Sentence sentence)
726     {
727         StringBuilder sb = new StringBuilder(sentence.wordList.size() * 3);
728         for (IWord word : sentence.wordList)
729         {
730             sb.append(word.getValue());
731         }
732 
733         return sb.toString();
734     }
735 
736     public static String combine(List<Word> wordList)
737     {
738         StringBuilder sb = new StringBuilder(wordList.size() * 3);
739         for (IWord word : wordList)
740         {
741             sb.append(word.getValue());
742         }
743 
744         return sb.toString();
745     }
746 }

来源:https://github.com/hankcs/HanLP/blob/master/src/main/java/com/hankcs/hanlp/utility/TextUtility.java

posted @ 2018-07-10 17:40  Jony.K.Chen  阅读(343)  评论(0)  编辑  收藏  举报