iOS开发之html解析
使用XPath解析html
可以从 https://github.com/topfunky/hpple 下载工程,将 TFHpple.h、TFHpple.m、TFHppleElement.h、TFHppleElement.m、XPathQuery.h、XPathQuery.m 加到自己的项目中,并在 Frameworks 中导入 libxml2 库(libxml2.dylib,新版 Xcode 中为 libxml2.tbd)。
在项目的 Build Settings 中找到 Other Linker Flags,加入 -lxml2;同时在 Header Search Paths 中加入 ${SDKROOT}/usr/include/libxml2,否则会找不到 libxml2 的头文件。
代码如下:
// Usage sample: download a page, convert it from GBK to UTF-8, then pull the
// title and link of a fixed range of <a> elements out of it with XPath.
//
// NOTE: this sample uses manual reference counting (explicit -release), not ARC.
// `self`, `currentNewsArr` and `currentHrefArr` belong to the enclosing class,
// which is not shown here.

NSString *urlString = @"http://www.xiyou.edu.cn/new/lm.jsp?urltype=tree.TreeTempUrl&wbtreeid=724";

// Synchronous load: this call blocks the calling thread until it finishes.
NSData *htmlData = [[NSData alloc] initWithContentsOfURL:[NSURL URLWithString:urlString]];
NSData *toHtmlData = [self toUTF8:htmlData];
TFHpple *xpathParser = [[TFHpple alloc] initWithHTMLData:toHtmlData];

NSArray *aArray = [xpathParser searchWithXPathQuery:@"//a"];

// Read 15 <a> elements starting at index 87. The range is clamped to the
// number of elements actually found so a changed page cannot cause an
// out-of-range exception.
const NSUInteger firstLinkIndex = 87;
const NSUInteger linkCount = 15;
const NSUInteger endIndex = MIN(firstLinkIndex + linkCount, [aArray count]);

for (NSUInteger i = firstLinkIndex; i < endIndex; i++) {
    TFHppleElement *aElement = [aArray objectAtIndex:i];

    // The text is read from the first child, three levels below the <a>
    // element. Stop early (node becomes nil) if any level has no children.
    TFHppleElement *node = aElement;
    for (int depth = 0; depth < 3 && node != nil; depth++) {
        NSArray *children = [node children];
        node = ([children count] > 0) ? [children objectAtIndex:0] : nil;
    }
    NSString *aStr = [node content];
    NSLog(@"aStr:%@", aStr);

    // Read the attributes of the <a> element.
    NSDictionary *aAttributeDict = [aElement attributes];
    NSLog(@"aAttributeDict:%@", aAttributeDict);

    NSString *href = [aAttributeDict objectForKey:@"href"];
    if (aStr == nil || href == nil) {
        // -addObject: raises on nil; skip the entry so both arrays stay parallel.
        continue;
    }

    NSString *hrefStr = [NSString stringWithFormat:@"http://www.xiyou.edu.cn%@", href];
    NSLog(@"hrefStr:%@", hrefStr);

    [currentNewsArr addObject:aStr];
    [currentHrefArr addObject:hrefStr];
}

// Released unconditionally so nothing leaks when no element matched.
[htmlData release];
[xpathParser release];

// If the page is not UTF-8 encoded (e.g. GBK), convert it to UTF-8 before parsing.

/// Decodes `sourceData` as GB 18030 text and returns it re-encoded as UTF-8.
/// The page's GBK charset META declaration is rewritten to UTF-8 as well.
/// @param sourceData The raw page bytes.
/// @return The UTF-8 encoded page, or nil if `sourceData` is nil or cannot be decoded.
- (NSData *)toUTF8:(NSData *)sourceData {
    if (sourceData == nil) {
        return nil;
    }

    CFStringRef gbkStr = CFStringCreateWithBytes(NULL,
                                                 [sourceData bytes],
                                                 [sourceData length],
                                                 kCFStringEncodingGB_18030_2000,
                                                 false);
    if (gbkStr == NULL) {
        return nil;
    }

    NSString *gbkString = (NSString *)gbkStr;
    // Adjust this to the charset declared in the page source; here GBK -> UTF-8.
    NSString *utf8_String =
        [gbkString stringByReplacingOccurrencesOfString:@"META http-equiv=\"Content-Type\" content=\"text/html; charset=GBK\""
                                             withString:@"META http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\""];
    NSData *utf8Data = [utf8_String dataUsingEncoding:NSUTF8StringEncoding];

    // gbkStr came from a Create function, so this code owns it. Release it only
    // after the data has been produced, because utf8_String may be the same object.
    CFRelease(gbkStr);

    return utf8Data;
}
http://blog.csdn.net/majiakun1/article/details/39472489