iOS 解析 HTML

xml,json都有大量的库来解析,我们如何解析html呢?

TFHpple是一个小型的封装,可以用来解析html,它是对libxml的封装,语法是xpath。

今天我看到一篇直接用libxml来解析html的文章,参看:http://www.cocoanetics.com/2011/09/taming-html-parsing-with-libxml-1/#comment-3090 文中那张图画得一目了然,很值得收藏。不过这篇文章中的源码不能遍历所有的html节点,我做了一点修改,可以将整个html遍历并打印出来:

// Parses an HTML document with libxml2 and logs every node in document order.
//
// Expected to be in scope:
//   data     - NSData holding the raw document bytes
//   encoding - the NSStringEncoding of those bytes
//   baseURL  - the document's base URL, i.e. its location

// Translate the Cocoa encoding into the IANA charset name libxml2 expects.
CFStringEncoding cfenc = CFStringConvertNSStringEncodingToEncoding(encoding);
CFStringRef cfencstr = CFStringConvertEncodingToIANACharSetName(cfenc);

// CFStringGetCStringPtr() is allowed to return NULL, so copy the charset name
// into a local buffer instead. A NULL `enc` lets libxml2 sniff the encoding.
char encBuffer[64];
const char *enc = NULL;
if (cfencstr && CFStringGetCString(cfencstr, encBuffer, sizeof(encBuffer), kCFStringEncodingASCII)) {
    enc = encBuffer;
}

// NSData is not NUL-terminated, so pass an explicit length instead of using
// htmlReadDoc(), which reads up to the first zero byte.
htmlDocPtr _htmlDocument = htmlReadMemory((const char *)[data bytes],
                                          (int)[data length],
                                          [[baseURL absoluteString] UTF8String],
                                          enc,
                                          HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);

// The traversal starts at, and must never climb above, the document node.
xmlNodePtr rootNode = (xmlNodePtr)_htmlDocument;
xmlNodePtr currentNode = rootNode;

while (currentNode) {
    // Output the node itself.
    if (currentNode->type == XML_ELEMENT_NODE) {
        NSMutableArray *attrArray = [NSMutableArray array];

        for (xmlAttrPtr attrNode = currentNode->properties; attrNode; attrNode = attrNode->next) {
            // Attributes without a value (e.g. <input disabled>) have no child node.
            xmlNodePtr contents = attrNode->children;
            const char *value = (contents && contents->content) ? (const char *)contents->content : "";

            [attrArray addObject:[NSString stringWithFormat:@"%s='%s'", (const char *)attrNode->name, value]];
        }

        NSString *attrString = [attrArray componentsJoinedByString:@" "];

        if ([attrString length]) {
            attrString = [@" " stringByAppendingString:attrString];
        }

        NSLog(@"<%s%@>", (const char *)currentNode->name, attrString);
    } else if (currentNode->type == XML_TEXT_NODE) {
        if (currentNode->content) {
            NSLog(@"%@", [NSString stringWithCString:(const char *)currentNode->content encoding:NSUTF8StringEncoding]);
        }
    } else if (currentNode->type == XML_COMMENT_NODE) {
        // The comment text lives in `content`; `name` is always "comment".
        NSLog(@"/* %s */", currentNode->content ? (const char *)currentNode->content : "");
    }

    // Depth first: descend into the children before visiting siblings.
    if (currentNode->children) {
        currentNode = currentNode->children;
        continue;
    }

    // No children: climb until a node with a following sibling is found,
    // logging the closing tag of every element that is left on the way up.
    while (currentNode && currentNode != rootNode && !currentNode->next) {
        currentNode = currentNode->parent;

        if (currentNode && currentNode->type == XML_ELEMENT_NODE) {
            NSLog(@"</%s>", (const char *)currentNode->name);
            if (strcmp((const char *)currentNode->name, "table") == 0) {
                NSLog(@"over");
            }
        }
    }

    // Back at the root (or fell off the tree): the whole document was visited.
    if (!currentNode || currentNode == rootNode) {
        break;
    }

    currentNode = currentNode->next;
}

// Release the document only after the traversal has finished with it.
if (_htmlDocument) {
    xmlFreeDoc(_htmlDocument);
}

 

不过我还是喜欢用TFHpple,因为它很简单,也好用,但是它的功能不是很完善。比如,不能获取children node,所以我写了两个方法,一个是获取children node,一个是获取所有的contents。另外,node属性的content的key与node自身content的key一样,都是@"nodeContent",正确情况下属性的key应是@"attributeContent"。

所以我写了这个方法,同时修改node属性的content key.

/// Recursively converts a libxml2 node into a dictionary description.
///
/// Keys written to the returned dictionary:
///   @"nodeName"           - the node's name
///   @"nodeAttributeArray" - array of attribute dictionaries (@"attributeName",
///                           @"attributeContent"), present only when non-empty
///   @"nodeChildArray"     - array of child-node dictionaries, present only when
///                           non-empty
///
/// Text nodes are folded into their parent instead of getting their own entry:
/// the text is stored in `parentResult` under @"nodeContent" (element parent) or,
/// whitespace-trimmed, under @"attributeContent" (attribute parent).
///
/// @param currentNode  The node to convert. Must not be NULL.
/// @param parentResult The dictionary being built for the parent node; receives
///                     the text of text nodes. May be nil.
/// @return The dictionary for `currentNode`, or nil when the node was a text node
///         that has been folded into `parentResult`.
NSDictionary *DictionaryForNode2(xmlNodePtr currentNode, NSMutableDictionary *parentResult) {
    NSMutableDictionary *resultForNode = [NSMutableDictionary dictionary];

    if (currentNode->name) {
        NSString *nodeName =
            [NSString stringWithCString:(const char *)currentNode->name encoding:NSUTF8StringEncoding];
        // stringWithCString: yields nil for bytes that are not valid UTF-8.
        if (nodeName) {
            [resultForNode setObject:nodeName forKey:@"nodeName"];
        }
    }

    // Only text nodes use `content`, so only decode it for them.
    if (currentNode->type == XML_TEXT_NODE && currentNode->content && currentNode->parent) {
        NSString *text =
            [NSString stringWithCString:(const char *)currentNode->content encoding:NSUTF8StringEncoding];
        xmlElementType parentType = currentNode->parent->type;

        if (parentType == XML_ELEMENT_NODE) {
            if (text) {
                [parentResult setObject:text forKey:@"nodeContent"];
            }
            return nil;
        }

        if (parentType == XML_ATTRIBUTE_NODE) {
            if (text) {
                NSCharacterSet *whitespace = [NSCharacterSet whitespaceAndNewlineCharacterSet];
                [parentResult setObject:[text stringByTrimmingCharactersInSet:whitespace]
                                 forKey:@"attributeContent"];
            }
            return nil;
        }
    }

    // Attributes.
    NSMutableArray *attributeArray = [NSMutableArray array];
    for (xmlAttr *attribute = currentNode->properties; attribute; attribute = attribute->next) {
        NSMutableDictionary *attributeDictionary = [NSMutableDictionary dictionary];

        NSString *attributeName = nil;
        if (attribute->name) {
            attributeName =
                [NSString stringWithCString:(const char *)attribute->name encoding:NSUTF8StringEncoding];
        }
        if (attributeName) {
            [attributeDictionary setObject:attributeName forKey:@"attributeName"];
        }

        if (attribute->children) {
            // A text child stores itself in attributeDictionary and returns nil;
            // any other kind of child is stored as a nested dictionary.
            NSDictionary *childDictionary = DictionaryForNode2(attribute->children, attributeDictionary);
            if (childDictionary) {
                [attributeDictionary setObject:childDictionary forKey:@"attributeContent"];
            }
        }

        if ([attributeDictionary count] > 0) {
            [attributeArray addObject:attributeDictionary];
        }
    }
    if ([attributeArray count] > 0) {
        [resultForNode setObject:attributeArray forKey:@"nodeAttributeArray"];
    }

    // Children.
    NSMutableArray *childContentArray = [NSMutableArray array];
    for (xmlNodePtr childNode = currentNode->children; childNode; childNode = childNode->next) {
        NSDictionary *childDictionary = DictionaryForNode2(childNode, resultForNode);
        if (childDictionary) {
            [childContentArray addObject:childDictionary];
        }
    }
    if ([childContentArray count] > 0) {
        [resultForNode setObject:childContentArray forKey:@"nodeChildArray"];
    }

    return resultForNode;
}

 

TFHppleElement.m里加了两个key 常量

// Dictionary key under which an attribute's text value is stored.
NSString * const TFHppleNodeAttributeContentKey  = @"attributeContent";
// Dictionary key under which a node's array of child-node dictionaries is stored.
NSString * const TFHppleNodeChildArrayKey        = @"nodeChildArray";

 

并修改获取属性方法为:

/// Returns the element's attributes as a name -> value dictionary.
///
/// Each entry of the node's attribute array contributes one pair, read from its
/// TFHppleNodeAttributeNameKey and TFHppleNodeAttributeContentKey values.
/// An attribute that has no stored content is mapped to the empty string so
/// that it is still reported; an entry without a name is skipped.
- (NSDictionary *) attributes {
  NSMutableDictionary * translatedAttributes = [NSMutableDictionary dictionary];
  for (NSDictionary * attributeDict in [node objectForKey:TFHppleNodeAttributeArrayKey]) {
    id attributeName = [attributeDict objectForKey:TFHppleNodeAttributeNameKey];
    if (!attributeName) {
      continue;
    }
    // setObject:forKey: throws on nil, so fall back to an empty value.
    id attributeContent = [attributeDict objectForKey:TFHppleNodeAttributeContentKey];
    [translatedAttributes setObject:(attributeContent ? attributeContent : @"")
                             forKey:attributeName];
  }
  return translatedAttributes;
}

 

并添加获取children node 方法:

/// Returns YES when the node dictionary has an entry for TFHppleNodeChildArrayKey.
- (BOOL) hasChildren {
    NSArray *childs = [node objectForKey: TFHppleNodeChildArrayKey];

    if (childs) {
        return YES;
    }

    return NO;
}

/// Returns the array stored under TFHppleNodeChildArrayKey, or nil when the
/// element has no children.
- (NSArray *) children {
    if ([self hasChildren]) {
        return [node objectForKey: TFHppleNodeChildArrayKey];
    }
    return nil;
}

 

最后我还加了一个获取所有content的方法:

// NOTE(review): presumably returns all content found at the given XPath/CSS query as one string; the implementation is not shown here, so confirm against the linked source.
1 - (NSString *)contentsAt:(NSString *)xPathOrCss;

 

请看 源码

posted @ 2013-01-15 17:54  GreyWolf  阅读(282)  评论(0编辑  收藏  举报