最新IP地址数据库 二分逼近&二分查找 高效解析800万大数据之区域分布
最新IP地址数据库 来自 qqzeng.com
利用二分逼近法(bisection method) ,每秒500多万, 比较高效!
多语言API解析Dat 导入数据库脚本
https://github.com/zengzhan/qqzeng-ip
原来的顺序查找算法 效率比较低
readonly string ipBinaryFilePath = "qqzengipdb.dat"; readonly byte[] dataBuffer, indexBuffer; readonly uint[] index = new uint[256]; readonly int dataLength; public IpLocation() { try { FileInfo file = new FileInfo(ipBinaryFilePath); dataBuffer = new byte[file.Length]; using (var fin = new FileStream(file.FullName, FileMode.Open, FileAccess.Read)) { fin.Read(dataBuffer, 0, dataBuffer.Length); } var offset_len = BytesToLong(dataBuffer[0], dataBuffer[1], dataBuffer[2], dataBuffer[3]); //Big Endian indexBuffer = new byte[offset_len]; Array.Copy(dataBuffer, 4, indexBuffer, 0, offset_len); dataLength = (int)offset_len; for (int loop = 0; loop < 256; loop++) { //索引 四字节 LITTLE_ENDIAN index[loop] = BytesToLong(indexBuffer[loop * 4 + 3], indexBuffer[loop * 4 + 2], indexBuffer[loop * 4 + 1], indexBuffer[loop * 4]); } } catch { } } public string[] Find(string ip) { var ips = ip.Split('.'); uint ip_prefix = uint.Parse(ips[0]); uint find_uint32 = BytesToLong(byte.Parse(ips[0]), byte.Parse(ips[1]), byte.Parse(ips[2]), byte.Parse(ips[3]));//BIG_ENDIAN // LITTLE_ENDIAN int max_len = 0; int resultOffset =-1; int resultLegth =-1; uint start = index[ip_prefix] * 8 + 1024; if (ip_prefix != 255) { max_len = (int)index[ip_prefix + 1] * 8 + 1024; } else { max_len = (int)index[255] * 8 + 1024+1; } for (; start < max_len; start += 8) { // 前四位 结束ip 后三位 偏移 最后一位 内容长度 uint endipNum = BytesToLong(indexBuffer[start + 0], indexBuffer[start + 1], indexBuffer[start + 2], indexBuffer[start + 3]);//BIG_ENDIAN if (endipNum >= find_uint32) { resultOffset =(int) BytesToLong((byte)0, indexBuffer[start + 6], indexBuffer[start + 5], indexBuffer[start + 4]);//LITTLE_ENDIAN resultLegth = 0xFF & indexBuffer[start + 7];// 长度 break; } } if (resultOffset==-1||resultLegth==-1) { return new string[] {"N/A"}; } var areaBytes = new byte[resultLegth]; Array.Copy(dataBuffer, dataLength + resultOffset - 1024, areaBytes, 0, resultLegth); return Encoding.UTF8.GetString(areaBytes).Split(' '); } private static uint BytesToLong(byte a, byte b, byte c, byte d) { return ((uint)a << 24) | ((uint)b << 16) | ((uint)c << 8) | (uint)d; } public static string long2IP(long longIP) { StringBuilder sb = new StringBuilder(""); sb.Append(longIP >> 24); sb.Append("."); //将高8位置0,然后右移16为 sb.Append((longIP & 0x00FFFFFF) >> 16); sb.Append("."); sb.Append((longIP & 0x0000FFFF) >> 8); sb.Append("."); sb.Append((longIP & 0x000000FF)); return sb.ToString(); } }
改进版 采用二分逼近 算法(类似二分查找,但又不同) 性能提升很大
public string[] Find(string ip) { var ips = ip.Split('.'); uint ip_prefix = uint.Parse(ips[0]); uint find_uint32 = BytesToLong(byte.Parse(ips[0]), byte.Parse(ips[1]), byte.Parse(ips[2]), byte.Parse(ips[3]));//BIG_ENDIAN uint max_len = 0; int resultOffset = -1; int resultLegth = -1; uint start = index[ip_prefix]; if (ip_prefix != 255) { max_len = index[ip_prefix + 1]; } else { max_len = index[255]; } uint num = max_len - start; uint my_index = BinarySearch(start, max_len, find_uint32); start = my_index * 8 + 1024; resultOffset = (int)BytesToLong((byte)0, indexBuffer[start + 6], indexBuffer[start + 5], indexBuffer[start + 4]);//LITTLE_ENDIAN resultLegth = 0xFF & indexBuffer[start + 7];// 长度 if (resultOffset == -1 || resultLegth == -1) { return new string[] { "N/A" }; } var areaBytes = new byte[resultLegth]; Array.Copy(dataBuffer, dataLength + resultOffset - 1024, areaBytes, 0, resultLegth); return Encoding.UTF8.GetString(areaBytes).Split(' '); } /// <summary> /// 二分逼近 /// </summary> public uint BinarySearch(uint low, uint high, uint k) { uint M = 0; while (low <= high) { uint mid = (low + high) / 2; uint endipNum = GetStartIp(mid); if (endipNum >= k) { M = mid; //mid有可能是解 high = mid - 1; } else low = mid + 1; } return M; }
有了上面高效算法 解析出来800多万数据 也很快 再用一个简单的ling 统计一下即可
var cn_result= from r in list group r by r.cn into g select new { key = g.Key, cnt = g.Count() };
800多万数据 统计组图
微信公众号:qqzeng-ip