用C++爬取网页

          做了好几天,终于写出来了,以前没有想到过,用C++也可以爬取网页,经过这么多天的努力终于做好了,解决了乱码问题。

从中学到很多,小到一个函数的参数,大到如何使用一个函数。

           还有C++中一直让人头疼的编码问题,Unicode编码问题,研究了很多资料,又对MultiByteToWideChar和WideCharToMultiByte进行了重新的认识。

一个重要的关键是windows默认的是ANSI字符集,同时对HTML的格式进行了分析,以判断编码问题。

           感觉那么多天的辛苦没有白费,付出有了收获。不过在此,真的感谢那些牛人,期间也参考了他们的代码。

 代码:

#include <winsock2.h>

#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#pragma comment(lib,"ws2_32.lib")
 
using namespace std;
 
void getWebPage(char *url)
{
    SOCKET sock;
    WSADATA wsa;
    struct sockaddr_in  addrclient;
    ofstream of;
    WSAStartup(MAKEWORD(2,2),&wsa);
    of.open("temp.txt");
    if(!of)
    {
        cout<<"open fail!"<<endl;
        return;
    }
    static char content[100000]="";
    char myurl[256];
    char host[256];
    char dom[256];
    char header[256];
    char type[512];
    char *p;
    memset(myurl,'\0',256);
    memset(host,'\0',256);
    memset(dom,'\0',256);
    memset(header,'\0',256);
    memset(type,'\0',512);
    char *purl=0;
    struct hostent *phost;
    sock=socket(PF_INET,SOCK_STREAM,IPPROTO_TCP);
 
    strcpy(myurl,url);
    for(purl=myurl;*purl!='/'&&purl!='\0';++purl);
    if(int(purl-myurl)==strlen(myurl))
        strcpy(host,"/");
    else
        strcpy(host,purl);
    *purl='\0';
    strcpy(dom,myurl);
 
    cout<<dom<<endl;          //输出域名
    cout<<host<<endl;     //输出地址
    of<<dom<<endl;
    of<<host<<endl;
    phost=gethostbyname(dom);
         
    addrclient.sin_family=AF_INET;
    addrclient.sin_port=htons(80);
    addrclient.sin_addr.S_un.S_addr=*((unsigned long *)phost->h_addr);
     
    connect(sock,(struct sockaddr*)&addrclient,sizeof(addrclient));
     
    strcat(header, "GET ");
    strcat(header, host);
    strcat(header, " HTTP/1.1\r\n");
    strcat(header, "Host: ");
    strcat(header, dom);
    strcat(header, "\r\nConnection: Close\r\n\r\n");
    send(sock,header,strlen(header),0);
    recv(sock,type,512,0);
    cout<<type<<endl;
    of<<type;
    p=strstr(type,"utf-8");
    if(p)
    {
    memset(content,'\0',100000);
    while(recv(sock,content,100000,0)>0)
    {
        int len=MultiByteToWideChar(CP_UTF8, 0, content, -1, NULL,0);
        unsigned short * wszGBK = new unsigned short[len+1];
        memset(wszGBK, 0, len * 2 + 2);
        MultiByteToWideChar(CP_UTF8, 0, content, -1, (LPWSTR)wszGBK, len);
        len = WideCharToMultiByte(CP_ACP, 0, (LPCWSTR)wszGBK, -1, NULL, 0, NULL, NULL); 
        char *szGBK=new char[len + 1];
        memset(szGBK, 0, len + 1);
        WideCharToMultiByte (CP_ACP, 0, (LPCWSTR)wszGBK, -1, szGBK, len, NULL,NULL);
        cout<<szGBK;
        of<<szGBK;
        strnset(content,'\0',100000);
        delete []wszGBK;
        delete [] szGBK;
    }
    }
    else
    {
        memset(type,'\0',512);
        recv(sock,type,512,0);
        cout<<type;
        of<<type;
        p=strstr(type,"gb2312");
        if(p)
        {
            while(recv(sock,content,100000,0))
            {
                cout<<content;
                of<<content;
                strnset(content,'\0',100000);
            }
        }
        else
        {
           while(recv(sock,content,100000,0)>0)
           {
               int len=MultiByteToWideChar(CP_UTF8, 0, content, -1, NULL,0);
               unsigned short * wszGBK = new unsigned short[len+1];
               memset(wszGBK, 0, len * 2 + 2);
               MultiByteToWideChar(CP_UTF8, 0, content, -1, (LPWSTR)wszGBK, len);
               len = WideCharToMultiByte(CP_ACP, 0, (LPCWSTR)wszGBK, -1, NULL, 0, NULL, NULL); 
               char *szGBK=new char[len + 1];
               memset(szGBK, 0, len + 1);
               WideCharToMultiByte (CP_ACP, 0, (LPCWSTR)wszGBK, -1, szGBK, len, NULL,NULL);
               cout<<szGBK;
               of<<szGBK;
               strnset(content,'\0',100000);
               delete []wszGBK;
               delete [] szGBK;
           }
        }
    }
    closesocket(sock);
    WSACleanup();
    of.close();
    cout<<endl;
}
int main()
{
    char url[256];
    cout<<"http://";
    cin>>url;
    getWebPage(url);
    return 0;
}

 对此,又对socket编程产生了兴趣,socket编程魅力无穷。

posted @   xshang  阅读(4613)  评论(2编辑  收藏  举报
编辑推荐:
· Linux系列:如何用 C#调用 C方法造成内存泄露
· AI与.NET技术实操系列(二):开始使用ML.NET
· 记一次.NET内存居高不下排查解决与启示
· 探究高空视频全景AR技术的实现原理
· 理解Rust引用及其生命周期标识(上)
阅读排行:
· 单线程的Redis速度为什么快?
· 阿里最新开源QwQ-32B,效果媲美deepseek-r1满血版,部署成本又又又降低了!
· 展开说说关于C#中ORM框架的用法!
· SQL Server 2025 AI相关能力初探
· Pantheons:用 TypeScript 打造主流大模型对话的一站式集成库
点击右上角即可分享
微信分享提示