处理特殊字符
对某些文档进行处理后得到的字符串中含有特殊字符。由于当前 Oracle 11g 数据库的字符集为 ZHS16GBK，部分数据内容存储到数据库之后会丢失。可以用下面的语句查询数据库的字符集：
SELECT value FROM NLS_DATABASE_PARAMETERS WHERE parameter = 'NLS_CHARACTERSET';
根据网上的资料，常用的字符包括：数字、大写字母、小写字母、汉字、一些常用的中英文标点符号，以及回车换行（\r\n）和制表符（tab）等几个特殊字符。
已知的为
数字: 48 - 57
大写字母: 65-90
小写字母: 97-122
中文汉字: 19968 - 40869,
其中 123 - 19967 这个范围内有很多不可见或者无意义的字符。
常见的一些 无意义字符为:
/// <summary>
/// Decimal code points of characters the article treats as invisible or meaningless:
/// 160-161 (U+00A0, U+00A1), 8192-8207 (U+2000-U+200F), 8232-8239 (U+2028-U+202F)
/// and 8287-8302 (U+205F-U+206E).
/// </summary>
static int[] int_invisibility_arr = new int[] { 160,161, 8192, 8193, 8194, 8195, 8196, 8197, 8198, 8199, 8200, 8201, 8202, 8203, 8204, 8205, 8206, 8207, 8232, 8233, 8234,
8235, 8236, 8237, 8238, 8239, 8287, 8288, 8289, 8290, 8291, 8292, 8293, 8294, 8295, 8296, 8297, 8298, 8299, 8300, 8301, 8302 };
对应的 \u 转义序列写法为：
// Returns a copy of s in which every occurrence of U+00A0, U+00A1, U+2000-U+200F,
// U+2028-U+202F and U+205F-U+206E is replaced by a single ASCII space.
return s.Replace("\u00a0", " ").Replace("\u00a1", " ").Replace("\u2000", " ").Replace("\u2001", " ").Replace("\u2002", " ").Replace("\u2003", " ").Replace("\u2004", " ")
.Replace("\u2005", " ").Replace("\u2006", " ").Replace("\u2007", " ").Replace("\u2008", " ").Replace("\u2009", " ")
.Replace("\u200a", " ").Replace("\u200b", " ").Replace("\u200c", " ").Replace("\u200d", " ").Replace("\u200e", " ")
.Replace("\u200f", " ").Replace("\u2028", " ").Replace("\u2029", " ").Replace("\u202a", " ").Replace("\u202b", " ")
.Replace("\u202c", " ").Replace("\u202d", " ").Replace("\u202e", " ").Replace("\u202f", " ").Replace("\u205f", " ")
.Replace("\u2060", " ").Replace("\u2061", " ").Replace("\u2062", " ").Replace("\u2063", " ").Replace("\u2064", " ")
.Replace("\u2065", " ").Replace("\u2066", " ").Replace("\u2067", " ").Replace("\u2068", " ").Replace("\u2069", " ")
.Replace("\u206a", " ").Replace("\u206b", " ").Replace("\u206c", " ").Replace("\u206d", " ").Replace("\u206e", " ");
另外收录一些可见的字符。（做法是：用 for 循环输出 123 - 19967 范围内每个码位对应的字符，有的在 VS 控制台中显示为乱码，会输出成 ?；然后把能正常输出的字符标识出来，得到如下有用的字符，仅供参考。）
/// <summary>
/// Decimal code points of characters that are generally visible
/// (values in this list run from 123 to 13269).
/// </summary>
static int[] int_visual_arr1 = new int[] { 123,124,125,126,162,163,164,165,166,167,168,170,173,175,176,177,178,179,180,181,183,185,186,192,193,
200,201,202,204,205,208,210,211,215,217,218,220,221,222,224,225,226,232,233,234,236,237,240,242,243,
247,249,250,252,253,254,256,257,274,275,282,283,298,299,324,328,332,333,362,363,449,461,462,463,464,
465,466,467,468,469,470,471,472,473,474,475,476,593,609,711,713,714,715,729,913,914,915,916,917,918,
919,920,921,922,923,924,925,926,927,928,929,931,932,933,934,935,936,937,945,946,947,948,949,950,951,
952,953,954,955,956,957,958,959,960,961,963,964,965,966,967,968,969,1025,1040,1041,1042,1043,1044,1045,1046,
1047,1048,1049,1050,1051,1052,1053,1054,1055,1056,1057,1058,1059,1060,1061,1062,1063,1064,1065,1066,1067,1068,1069,1070,1071,
1072,1073,1074,1075,1076,1077,1078,1079,1080,1081,1082,1083,1084,1085,1086,1087,1088,1089,1090,1091,1092,1093,1094,1095,1096,
1097,1098,1099,1100,1101,1102,1103,1105,8208,8211,8212,8213,8214,8216,8217,8220,8221,8229,8230,8240,8242,8243,8245,8251,8254,
8364,8451,8453,8457,8470,8481,8544,8545,8546,8547,8548,8549,8550,8551,8552,8553,8554,8555,8560,8561,8562,8563,8564,8565,8566,
8567,8568,8569,8592,8593,8594,8595,8598,8599,8600,8601,8712,8719,8721,8725,8728,8730,8733,8734,8735,8736,8739,8741,8743,8744,
8745,8746,8747,8750,8756,8757,8758,8759,8764,8765,8776,8780,8786,8800,8801,8804,8805,8806,8807,8814,8815,8853,8857,8869,8895,
8978,9312,9313,9314,9315,9316,9317,9318,9319,9320,9321,9332,9333,9334,9335,9336,9337,9338,9339,9340,9341,9342,9343,9344,9345,
9346,9347,9348,9349,9350,9351,9352,9353,9354,9355,9356,9357,9358,9359,9360,9361,9362,9363,9364,9365,9366,9367,9368,9369,9370,
9371,9472,9473,9474,9475,9476,9477,9478,9479,9480,9481,9482,9483,9484,9485,9486,9487,9488,9489,9490,9491,9492,9493,9494,9495,
9496,9497,9498,9499,9500,9501,9502,9503,9504,9505,9506,9507,9508,9509,9510,9511,9512,9513,9514,9515,9516,9517,9518,9519,9520,
9521,9522,9523,9524,9525,9526,9527,9528,9529,9530,9531,9532,9533,9534,9535,9536,9537,9538,9539,9540,9541,9542,9543,9544,9545,
9546,9547,9552,9553,9554,9555,9556,9557,9558,9559,9560,9561,9562,9563,9564,9565,9566,9567,9568,9569,9570,9571,9572,9573,9574,
9575,9576,9577,9578,9579,9580,9581,9582,9583,9584,9585,9586,9587,9601,9602,9603,9604,9605,9606,9607,9608,9609,9610,9611,9612,
9613,9614,9615,9619,9620,9621,9632,9633,9650,9651,9660,9661,9670,9671,9675,9678,9679,9698,9699,9700,9701,9733,9734,9737,9792,
9794,12289,12290,12291,12293,12294,12295,12296,12297,12298,12299,12300,12301,12302,12303,12304,12305,12306,12307,12308,12309,12310,12311,12317,12318,
12321,12322,12323,12324,12325,12326,12327,12328,12329,12353,12354,12355,12356,12357,12358,12359,12360,12361,12362,12363,12364,12365,12366,12367,12368,
12369,12370,12371,12372,12373,12374,12375,12376,12377,12378,12379,12380,12381,12382,12383,12384,12385,12386,12387,12388,12389,12390,12391,12392,12393,
12394,12395,12396,12397,12398,12399,12400,12401,12402,12403,12404,12405,12406,12407,12408,12409,12410,12411,12412,12413,12414,12415,12416,12417,12418,
12419,12420,12421,12422,12423,12424,12425,12426,12427,12428,12429,12430,12431,12432,12433,12434,12435,12443,12444,12445,12446,12449,12450,12451,12452,
12453,12454,12455,12456,12457,12458,12459,12460,12461,12462,12463,12464,12465,12466,12467,12468,12469,12470,12471,12472,12473,12474,12475,12476,12477,
12478,12479,12480,12481,12482,12483,12484,12485,12486,12487,12488,12489,12490,12491,12492,12493,12494,12495,12496,12497,12498,12499,12500,12501,12502,
12503,12504,12505,12506,12507,12508,12509,12510,12511,12512,12513,12514,12515,12516,12517,12518,12519,12520,12521,12522,12523,12524,12525,12526,12527,
12528,12529,12530,12531,12532,12533,12534,12540,12541,12542,12549,12550,12551,12552,12553,12554,12555,12556,12557,12558,12559,12560,12561,12562,12563,
12564,12565,12566,12567,12568,12569,12570,12571,12572,12573,12574,12575,12576,12577,12578,12579,12580,12581,12582,12583,12584,12585,12690,12691,12692,
12693,12694,12695,12696,12697,12698,12699,12700,12701,12702,12703,12832,12833,12834,12835,12836,12837,12838,12839,12840,12841,12842,12843,12844,12845,
12846,12847,12848,12849,12850,12851,12852,12853,12854,12855,12856,12857,12858,12859,12860,12861,12862,12863,12864,12865,12866,12867,12928,12929,12930,
12931,12932,12933,12934,12935,12936,12937,12938,12939,12940,12941,12942,12943,12944,12945,12946,12947,12948,12949,12950,12951,12952,12953,12954,12955,
12956,12957,12959,12960,12961,12962,12963,12969,12970,12971,12972,12973,12974,12975,12976,13198,13199,13212,13213,13214,13217,13252,13262,13265,13266,
13269};
提供一个完整的处理类
StringHelper类
/// <summary>
/// Helpers that replace characters considered invisible or unusual with an ASCII space.
/// Both methods work on individual UTF-16 code units and always return a string of the
/// same length as the input.
/// </summary>
public class StringHelper
{
    /// <summary>
    /// Decimal values of the UTF-16 code units above 122 (and below the 19968-40869 block)
    /// that <see cref="GetUsefulString"/> keeps as visible characters.
    /// </summary>
    private static readonly int[] int_visual_arr1 = new int[] { 123,124,125,126,162,163,164,165,166,167,168,170,173,175,176,177,178,179,180,181,183,185,186,192,193,
        200,201,202,204,205,208,210,211,215,217,218,220,221,222,224,225,226,232,233,234,236,237,240,242,243,
        247,249,250,252,253,254,256,257,274,275,282,283,298,299,324,328,332,333,362,363,449,461,462,463,464,
        465,466,467,468,469,470,471,472,473,474,475,476,593,609,711,713,714,715,729,913,914,915,916,917,918,
        919,920,921,922,923,924,925,926,927,928,929,931,932,933,934,935,936,937,945,946,947,948,949,950,951,
        952,953,954,955,956,957,958,959,960,961,963,964,965,966,967,968,969,1025,1040,1041,1042,1043,1044,1045,1046,
        1047,1048,1049,1050,1051,1052,1053,1054,1055,1056,1057,1058,1059,1060,1061,1062,1063,1064,1065,1066,1067,1068,1069,1070,1071,
        1072,1073,1074,1075,1076,1077,1078,1079,1080,1081,1082,1083,1084,1085,1086,1087,1088,1089,1090,1091,1092,1093,1094,1095,1096,
        1097,1098,1099,1100,1101,1102,1103,1105,8208,8211,8212,8213,8214,8216,8217,8220,8221,8229,8230,8240,8242,8243,8245,8251,8254,
        8364,8451,8453,8457,8470,8481,8544,8545,8546,8547,8548,8549,8550,8551,8552,8553,8554,8555,8560,8561,8562,8563,8564,8565,8566,
        8567,8568,8569,8592,8593,8594,8595,8598,8599,8600,8601,8712,8719,8721,8725,8728,8730,8733,8734,8735,8736,8739,8741,8743,8744,
        8745,8746,8747,8750,8756,8757,8758,8759,8764,8765,8776,8780,8786,8800,8801,8804,8805,8806,8807,8814,8815,8853,8857,8869,8895,
        8978,9312,9313,9314,9315,9316,9317,9318,9319,9320,9321,9332,9333,9334,9335,9336,9337,9338,9339,9340,9341,9342,9343,9344,9345,
        9346,9347,9348,9349,9350,9351,9352,9353,9354,9355,9356,9357,9358,9359,9360,9361,9362,9363,9364,9365,9366,9367,9368,9369,9370,
        9371,9472,9473,9474,9475,9476,9477,9478,9479,9480,9481,9482,9483,9484,9485,9486,9487,9488,9489,9490,9491,9492,9493,9494,9495,
        9496,9497,9498,9499,9500,9501,9502,9503,9504,9505,9506,9507,9508,9509,9510,9511,9512,9513,9514,9515,9516,9517,9518,9519,9520,
        9521,9522,9523,9524,9525,9526,9527,9528,9529,9530,9531,9532,9533,9534,9535,9536,9537,9538,9539,9540,9541,9542,9543,9544,9545,
        9546,9547,9552,9553,9554,9555,9556,9557,9558,9559,9560,9561,9562,9563,9564,9565,9566,9567,9568,9569,9570,9571,9572,9573,9574,
        9575,9576,9577,9578,9579,9580,9581,9582,9583,9584,9585,9586,9587,9601,9602,9603,9604,9605,9606,9607,9608,9609,9610,9611,9612,
        9613,9614,9615,9619,9620,9621,9632,9633,9650,9651,9660,9661,9670,9671,9675,9678,9679,9698,9699,9700,9701,9733,9734,9737,9792,
        9794,12289,12290,12291,12293,12294,12295,12296,12297,12298,12299,12300,12301,12302,12303,12304,12305,12306,12307,12308,12309,12310,12311,12317,12318,
        12321,12322,12323,12324,12325,12326,12327,12328,12329,12353,12354,12355,12356,12357,12358,12359,12360,12361,12362,12363,12364,12365,12366,12367,12368,
        12369,12370,12371,12372,12373,12374,12375,12376,12377,12378,12379,12380,12381,12382,12383,12384,12385,12386,12387,12388,12389,12390,12391,12392,12393,
        12394,12395,12396,12397,12398,12399,12400,12401,12402,12403,12404,12405,12406,12407,12408,12409,12410,12411,12412,12413,12414,12415,12416,12417,12418,
        12419,12420,12421,12422,12423,12424,12425,12426,12427,12428,12429,12430,12431,12432,12433,12434,12435,12443,12444,12445,12446,12449,12450,12451,12452,
        12453,12454,12455,12456,12457,12458,12459,12460,12461,12462,12463,12464,12465,12466,12467,12468,12469,12470,12471,12472,12473,12474,12475,12476,12477,
        12478,12479,12480,12481,12482,12483,12484,12485,12486,12487,12488,12489,12490,12491,12492,12493,12494,12495,12496,12497,12498,12499,12500,12501,12502,
        12503,12504,12505,12506,12507,12508,12509,12510,12511,12512,12513,12514,12515,12516,12517,12518,12519,12520,12521,12522,12523,12524,12525,12526,12527,
        12528,12529,12530,12531,12532,12533,12534,12540,12541,12542,12549,12550,12551,12552,12553,12554,12555,12556,12557,12558,12559,12560,12561,12562,12563,
        12564,12565,12566,12567,12568,12569,12570,12571,12572,12573,12574,12575,12576,12577,12578,12579,12580,12581,12582,12583,12584,12585,12690,12691,12692,
        12693,12694,12695,12696,12697,12698,12699,12700,12701,12702,12703,12832,12833,12834,12835,12836,12837,12838,12839,12840,12841,12842,12843,12844,12845,
        12846,12847,12848,12849,12850,12851,12852,12853,12854,12855,12856,12857,12858,12859,12860,12861,12862,12863,12864,12865,12866,12867,12928,12929,12930,
        12931,12932,12933,12934,12935,12936,12937,12938,12939,12940,12941,12942,12943,12944,12945,12946,12947,12948,12949,12950,12951,12952,12953,12954,12955,
        12956,12957,12959,12960,12961,12962,12963,12969,12970,12971,12972,12973,12974,12975,12976,13198,13199,13212,13213,13214,13217,13252,13262,13265,13266,
        13269};

    /// <summary>
    /// Decimal values of the UTF-16 code units treated as invisible:
    /// 8192-8207 (U+2000-U+200F), 8232-8239 (U+2028-U+202F) and 8287-8302 (U+205F-U+206E).
    /// </summary>
    private static readonly int[] int_invisibility_arr = new int[] { 8192, 8193, 8194, 8195, 8196, 8197, 8198, 8199, 8200, 8201, 8202, 8203, 8204, 8205, 8206, 8207, 8232, 8233, 8234,
        8235, 8236, 8237, 8238, 8239, 8287, 8288, 8289, 8290, 8291, 8292, 8293, 8294, 8295, 8296, 8297, 8298, 8299, 8300, 8301, 8302 };

    // Hash-set views of the two tables above so that each membership test is a single lookup
    // instead of a linear scan. They must be declared after the arrays: static field
    // initializers run in textual order.
    private static readonly System.Collections.Generic.HashSet<int> VisibleCodeUnits =
        new System.Collections.Generic.HashSet<int>(int_visual_arr1);

    private static readonly System.Collections.Generic.HashSet<int> InvisibleCodeUnits =
        new System.Collections.Generic.HashSet<int>(int_invisibility_arr);

    /// <summary>
    /// Keeps only the characters considered useful and replaces every other UTF-16 code unit
    /// with a single ASCII space.
    /// </summary>
    /// <remarks>
    /// A code unit is kept when it is not in the invisible table and is either less than or
    /// equal to 122, inside 19968-40869, or listed in the visible table. Everything else is
    /// replaced, which includes both halves of a surrogate pair and fullwidth forms such as
    /// U+FF0C.
    /// NOTE(review): code units 0-31 (control characters) satisfy the "&lt;= 122" rule and are
    /// therefore kept; confirm this is intended.
    /// </remarks>
    /// <param name="s">Text to clean. May be null or empty.</param>
    /// <returns>
    /// A string of the same length as <paramref name="s"/>; null or empty input is returned unchanged.
    /// </returns>
    public static string GetUsefulString(string s)
    {
        if (string.IsNullOrEmpty(s))
        {
            return s;
        }

        char[] chars = s.ToCharArray();
        for (int i = 0; i < chars.Length; i++)
        {
            if (!IsUsefulCodeUnit(chars[i]))
            {
                chars[i] = ' ';
            }
        }

        return new string(chars);
    }

    /// <summary>
    /// Replaces the invisible characters U+00A0, U+00A1, U+2000-U+200F, U+2028-U+202F and
    /// U+205F-U+206E with a single ASCII space and leaves every other character untouched.
    /// See https://www.cnblogs.com/admans/p/17879366.html
    /// </summary>
    /// <param name="s">Text to clean. May be null or empty.</param>
    /// <returns>
    /// A string of the same length as <paramref name="s"/>; null or empty input is returned unchanged.
    /// </returns>
    public static string GetNoEmptyString(string s)
    {
        if (string.IsNullOrEmpty(s))
        {
            return s;
        }

        // Copy lazily: most inputs contain nothing to replace and can be returned as-is.
        char[] chars = null;
        for (int i = 0; i < s.Length; i++)
        {
            int c = s[i];
            if (c == 0x00A0 || c == 0x00A1 || InvisibleCodeUnits.Contains(c))
            {
                if (chars == null)
                {
                    chars = s.ToCharArray();
                }

                chars[i] = ' ';
            }
        }

        return chars == null ? s : new string(chars);
    }

    // Decides whether GetUsefulString keeps the given UTF-16 code unit.
    private static bool IsUsefulCodeUnit(char ch)
    {
        int c = ch;

        // The invisible table is consulted first so that it always wins over the keep rules.
        if (InvisibleCodeUnits.Contains(c))
        {
            return false;
        }

        return c <= 122 || (c >= 19968 && c <= 40869) || VisibleCodeUnits.Contains(c);
    }
}
本文来自博客园,作者:兴想事成,转载请注明原文链接:https://www.cnblogs.com/mjxxsc/p/18142903