[转]libxml中文解决方案

网摘1:

libxml2 在内部统一使用 UTF-8 编码处理数据,所以在修改 GB2312 编码的 XML 文件并保存时,需要先把传给 libxml2 的数据转换为 UTF-8 编码,然后再进行修改和保存!
转换使用iconv,以下是转换代码
/**
 * Convert a NUL-terminated string from one character encoding to another
 * with iconv.
 *
 * @param encFrom  name of the encoding `in` is currently in
 * @param encTo    name of the encoding to convert to
 * @param in       NUL-terminated input string
 * @return pointer to a static internal buffer holding the converted,
 *         NUL-terminated string, or NULL if an argument is NULL, the
 *         conversion cannot be opened, the input cannot be converted, or
 *         the result does not fit in the 1024-byte buffer.
 *
 * The returned buffer is overwritten by the next call, so copy the result
 * if it has to outlive that call; the function is not reentrant.
 */
char * ConvertEnc( char *encFrom, char *encTo, const char * in)
{
    static char bufout[1024];
    char *src;
    char *dst = bufout;
    /* iconv() works on size_t byte counts; passing the address of an int
     * cast to size_t * corrupts the counts where size_t is wider than int. */
    size_t src_left;
    size_t dst_left = sizeof bufout;
    size_t rc;
    iconv_t cd;

    if (encFrom == NULL || encTo == NULL || in == NULL)
    {
        return NULL;
    }

    cd = iconv_open(encTo, encFrom);
    if (cd == (iconv_t)-1)
    {
        printf("iconv_open false: %s ==> %s", encFrom, encTo);
        return NULL;
    }

    /* iconv() takes a non-const char **; the cast only satisfies the prototype. */
    src = (char *)in;
    /* Convert the terminating NUL as well so the output is terminated. */
    src_left = strlen(in) + 1;
    rc = iconv(cd, &src, &src_left, &dst, &dst_left);

    /* Release the descriptor on every path, including conversion failure. */
    iconv_close(cd);

    if (rc == (size_t)-1)
    {
        return NULL;
    }
    return bufout;
}
以下是实例(示例代码中调用的 Convert 与上面的 ConvertEnc 是同一个转换函数,只是名字不同)
test.xml
<?xml version="1.0" encoding="gb2312"?>
<parent>测试</parent>
读取代码
int main(void)
{
        xmlDocPtr doc = NULL;
        xmlNodePtr cur = NULL;
        doc = xmlParseFile("test.xml");
        cur = xmlDocGetRootElement(doc);
        printf("%s", Convert("utf-8", "gb2312", (char *)xmlNodeGetContent(cur)));
}
修改、保存代码
test2.xml
<?xml version="1.0" encoding="gb2312"?>
<story>
  <storyinfo>
    <author>John Fleck</author>
    <datewritten>June 2, 2002</datewritten>
    <keyword>我来也 example keyword</keyword>
    <书目> C++ </书目>
    <测试> test </测试>
  </storyinfo>
  <body>
    <headline>This is the headline</headline>
    <para>This is the body text.</para>
  </body>
</story>

/**
 * Parse an XML file whose root element is <story> and append an empty
 * <reference> child carrying a "uri" attribute to that root.
 *
 * @param docname  path of the XML file to parse
 * @param uri      value for the new element's "uri" attribute; it is handed
 *                 to libxml2 unchanged, so it should already be UTF-8
 * @return the modified document, which the caller releases with
 *         xmlFreeDoc(), or NULL if the file cannot be parsed, is empty, has
 *         a different root element, or the new node/attribute cannot be
 *         created.
 */
xmlDocPtr
parseDoc(char *docname, char *uri) {
 xmlDocPtr doc;
 xmlNodePtr cur;
 xmlNodePtr newnode;

 doc = xmlParseFile(docname);
 if (doc == NULL ) {
  fprintf(stderr,"Document not parsed successfully. ");
  return (NULL);
 }

 cur = xmlDocGetRootElement(doc);
 if (cur == NULL) {
  fprintf(stderr,"empty document");
  xmlFreeDoc(doc);
  return (NULL);
 }

 /* xmlStrcmp() returns non-zero when the names differ. */
 if (xmlStrcmp(cur->name, (const xmlChar *) "story")) {
  fprintf(stderr,"document of the wrong type, root node != story");
  xmlFreeDoc(doc);
  return (NULL);
 }

 newnode = xmlNewTextChild (cur, NULL, (xmlChar *)"reference", NULL);
 /* Only attach the attribute to a node that was actually created; a node
  * already linked under the root is released together with the document. */
 if (newnode == NULL ||
     xmlNewProp (newnode, (xmlChar *)"uri", (xmlChar *)uri) == NULL) {
  fprintf(stderr,"failed to add reference node");
  xmlFreeDoc(doc);
  return (NULL);
 }
 return(doc);
}
/*
 * Usage: <program> <encoding> <xmlfile>
 *
 * Adds a <reference uri="..."/> element to <xmlfile> via parseDoc() and
 * writes the resulting document to standard output in <encoding>.
 *
 * Returns 0 on success and 1 on a usage error or any processing failure.
 */
int main(int argc, char ** argv )
{
    xmlDocPtr doc = NULL;
    const char * output = "-";   /* "-" makes libxml2 write to stdout */
    char * szNode = NULL;
    int ret = 0;

    /* argv[1] (encoding) and argv[2] (file name) are both required. */
    if (argc < 3)
    {
        fprintf(stderr, "Usage: %s encoding xmlfile\n", argv[0]);
        return 1;
    }

    xmlAddEncodingAlias("UTF-8", "DVEnc");
    xmlKeepBlanksDefault(0);
    /* NOTE(review): defaultEntityLoader and xmllintExternalEntityLoader are
     * not defined in this snippet; they must be provided elsewhere. */
    defaultEntityLoader = xmlGetExternalEntityLoader();
    xmlSetExternalEntityLoader(xmllintExternalEntityLoader);
    xmlLineNumbersDefault(1);

    /* Assumes this source file is saved as GB2312, so the literal below is
     * GB2312 bytes that need converting to UTF-8 -- TODO confirm. */
    szNode = Convert("gb2312", "utf-8", "测试" );
    if (szNode == NULL)
    {
        fprintf(stderr, "failed to convert attribute value to UTF-8\n");
        return 1;
    }

    doc = parseDoc( argv[2], szNode);
    if (doc == NULL)
    {
        /* parseDoc() has already reported the reason. */
        return 1;
    }

    ret = xmlSaveFormatFileEnc(output, doc, argv[1], 1);
    if (ret < 0)
    {
        fprintf(stderr, "failed save to %s",
                output);
    }

    xmlFreeDoc(doc);
    return (ret < 0) ? 1 : 0;
}
执行
xmlout gb2312 test2.xml
结果
<?xml version="1.0" encoding="gb2312"?>
<story>
  <storyinfo>
    <author>John Fleck</author>
    <datewritten>June 2, 2002</datewritten>
    <keyword>我来也 example keyword</keyword>
    <书目> C++ </书目>
    <测试> test </测试>
  </storyinfo>
  <body>
    <headline>This is the headline</headline>
    <para>This is the body text.</para>
  </body>
  <reference uri="测试"/>
</story>

 

网摘2:

LibXML2自身已经支持了中文编码,只是它的所有API处理的数据都是UTF-8编码的,所以只要在读入和写入数据时进行相应转换即可!代码1使用Linux下的C API(iconv)进行编码转换;代码2因为libxml2已融合了iconv,直接使用libxml2的函数来进行编码转换。

/*
compile: gcc -I/usr/include/libxml2/ -lxml2 iconv.c
input:
   test.xml
<?xml version="1.0" encoding="gb2312"?>
<parent>测试</parent>
output:
    测试
*/
1)     iconv
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>
#include <iconv.h>
#include <libxml/parser.h>
#include <libxml/xmlmemory.h>
// Convert a NUL-terminated string from one character encoding to another
// with iconv.
//
//   encFrom  name of the encoding `in` is currently in
//   encTo    name of the encoding to convert to
//   in       NUL-terminated input string
//
// Returns a pointer to a static internal buffer holding the converted,
// NUL-terminated string, or NULL if an argument is NULL, the conversion
// cannot be opened, the input cannot be converted, or the result does not
// fit in the 1024-byte buffer. The buffer is overwritten by the next call,
// so copy the result if it has to outlive that call; not reentrant.
char * Convert( char *encFrom, char *encTo, const char * in)
{
    static char bufout[1024];
    char *src;
    char *dst = bufout;
    // iconv() works on size_t byte counts; passing the address of an int
    // cast to size_t * corrupts the counts where size_t is wider than int.
    size_t src_left;
    size_t dst_left = sizeof bufout;
    size_t rc;
    iconv_t cd;

    if (encFrom == NULL || encTo == NULL || in == NULL)
    {
       return NULL;
    }

    cd = iconv_open(encTo, encFrom);
    if (cd == (iconv_t)-1)
    {
       printf("iconv_open false: %s ==> %s\n", encFrom, encTo);
       return NULL;
    }

    // iconv() takes a non-const char **; the cast only satisfies the prototype.
    src = (char *)in;
    // Convert the terminating NUL as well so the output is terminated.
    src_left = strlen(in) + 1;
    rc = iconv(cd, &src, &src_left, &dst, &dst_left);

    // Release the descriptor on every path, including conversion failure.
    iconv_close(cd);

    if (rc == (size_t)-1)
    {
       return NULL;
    }
    return bufout;
}
int main(void)
{
        xmlDocPtr doc = NULL;
        xmlNodePtr cur = NULL;
        doc = xmlParseFile("test.xml");
        cur = xmlDocGetRootElement(doc);
        printf("%s\n", (char *)xmlNodeGetContent(cur));
        //printf("%s\n", Convert("utf-8", "gb2312", (char *)xmlNodeGetContent(cur)));
}


2)       xmlFindCharEncodingHandler
使用数据类型:xmlCharEncodingHandlerPtr
/*******************************************
* compile: gcc -I/usr/include/libxml2/ -lxml2 convert.c
* usage: convert utf-8 string or null
* input: ./convert 测试
* output:
[wuqifu@localhost test]$ ./convert 测试
ISO-8859-1:虏芒脢脭
<?xml version="1.0" encoding="ISO-8859-1"?>
<root>测试</root>
*******************************************/
#include <libxml/encoding.h>
/**
* function name: ConvertInput
* input:
    @in: string in a given encoding
    @encoding: the encoding used
* description: Converts @in into UTF-8 for processing with libxml2 APIs
* return: returns the converted UTF-8 string, or NULL in case of error.
**/
unsigned char* ConvertInput(const char *in, const char *encoding)
{
    unsigned char *out;
    int ret;
    int size;
    int out_size;
    int temp;
    xmlCharEncodingHandlerPtr handler;
    if (in == 0)
        return 0;
    handler = xmlFindCharEncodingHandler(encoding);
    if (!handler) {
        printf("ConvertInput: no encoding handler found for '%s'\n",
               encoding ? encoding : "");
        return 0;
    }
    size = (int) strlen(in) + 1;
    out_size = size * 2 - 1;
    out = (unsigned char *) xmlMalloc((size_t) out_size);
    if (out != 0) {
        temp = size - 1;
        ret = handler->input(out, &out_size, (const unsigned char *) in, &temp);
        if ((ret < 0) || (temp - size + 1)) {
            if (ret < 0) {
                printf("ConvertInput: conversion wasn't successful.\n");
            } else {
                printf
                    ("ConvertInput: conversion wasn't successful. converted: %i octets.\n",
                     temp);
            }
            xmlFree(out);
            out = 0;
        } else {
            out = (unsigned char *) xmlRealloc(out, out_size + 1);
            out[out_size] = 0; /*null terminating out */
        }
    } else {
        printf("ConvertInput: no mem\n");
    }
    return out;
}
int    main(int argc, char **argv)
{
       unsigned char *content, *out;
       xmlDocPtr doc;
       xmlNodePtr rootnode;
       char *encoding = "ISO-8859-1";   //utf-8, ISO-8859-1
       if (argc <= 1) {
              printf("Usage: %s content\n", argv[0]);
              return(0);
       }
       content = argv[1];
       out = ConvertInput(content, encoding);
       printf( "%s:%s\n", encoding, out );
       doc = xmlNewDoc ("1.0");
       rootnode = xmlNewDocNode(doc, NULL, (const xmlChar*)"root", out);
       xmlDocSetRootElement(doc, rootnode);
       xmlSaveFormatFileEnc("-", doc, encoding, 1);
       return (1);
}

posted @ 2010-02-22 14:36  喝水的牛儿  阅读(1743)  评论(0)  编辑  收藏  举报