多写原创文章,多分享,帮助他人,快乐自己!

如何打造网站克隆、仿站工具(C#版)

前两天朋友叫我模仿一个网站,刚刚开始,我一个页面一个页面查看源码并复制和保存,花了我很多时间,一个字“累”,为了减轻工作量,我写了个网站“克隆工具”,一键克隆,比起人工操作, 
效率提高了200%以上,精确度也大大提高,虽然网上也很多网站克隆工具,但我觉得作为一个程序员,要有点研究精神,哈哈哈,可以根据自己的需要随意编写自己需要的功能。

下面我将我写的“网站克隆工具”实现方法分享给大家,源码在文末有下载链接,有需要的朋友可以下载来玩,也可以根据自己的需要做相应的修改或优化。

新打造的QQ群发器:https://www.cnblogs.com/jonlan/p/12866640.html

一睹为快,先看看界面:

 

简单的工作流程:

 

项目代码目录结构:

 

下面一步步实现程序功能:

 

1.新建主界面窗体(MainForm.cs):

2.新建模型类(UrlModel.cs)

1
2
3
4
5
6
7
8
9
10
11
public class UrlModel
    {
        public string RelatedPath { get; set; }
        public string AbsoluteUri { get; set; }
        public string CurrPath { get; set; }
        public string RootPath { get; set; }
 
        public string Host { get; set; }
        public int Port { get; set; }
        public string Scheme { get; set; }
    }

3.新建服务类(Services)

UrlParser:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
public class UrlParser
    {
        public static UrlModel Parse(string url)
        {
            UrlModel model = new UrlModel();
 
            //默认
            if (url.Length < 8)
                throw new Exception("url参数不正确");
            else if (!url.ToLower().StartsWith("http:") && !url.ToLower().StartsWith("https:"))
                throw new Exception("url格式有误");
 
            if (url.LastIndexOf('/') < 8)
                url = url + "/";
 
            Regex reg = new Regex("(?<scheme>(http|https))://(?<host>.+?)/", RegexOptions.Singleline);
 
            if (reg.IsMatch(url))
            {
                string scheme = reg.Match(url).Groups["scheme"].Value;
                string host = reg.Match(url).Groups["host"].Value;
                if (host.Contains(":"))
                {
                    var aa = host.Split(':');
                    if (aa.Length == 2)
                    {
                        model.Host = aa[0];
                        model.Port = int.Parse(aa[1]);
                    }
                }
                else
                {
                    model.Host = host;
                    model.Port = 80;
                }
 
                int index = url.IndexOf('/', 8);
 
                model.RelatedPath = url.Substring(index);
                model.AbsoluteUri = url;
                model.Scheme = scheme;
                model.CurrPath = url.Substring(0, url.LastIndexOf("/"));
 
                if (80 == model.Port)
                {
                    model.RootPath = string.Format("{0}://{1}", model.Scheme, model.Host);
                }
                else
                {
                    model.RootPath = string.Format("{0}://{1}:{2", model.Scheme, model.Host, model.Port);
                }
            }
            else
            {
                throw new Exception("url解析失败!");
            }
 
            return model;
        }
    }

WebPageService:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
/// <summary>
    /// 网页处理服务工具
    /// </summary>
    public class WebPageService
    {
        private static string[] excludekeys = { "http:", "https:", "//", "#", "javascript:", "?", "tel:", "mailto:" };
        /// <summary>
        /// 获取所有html元素的href属性值,只获取站点本地的链接,站外的不获取
        /// </summary>
        /// <param name="html">页面的html源码</param>
        /// <returns></returns>
        public static List<UrlModel> GetLocalHrefs(string url,string html)
        {
            if (string.IsNullOrEmpty(html))
                return new List<UrlModel>();
 
            Dictionary<string, UrlModel> urls = GetHrefs(url,html);
            List<UrlModel> newUrls = new List<UrlModel>();
 
            if (null != urls)
            {
                foreach (string key in urls.Keys)
                {
                    string newkey = key.ToLower();
                    bool iscontained = false;
                    foreach (var exkey in excludekeys)
                    {
                        if (newkey.IndexOf(exkey) == 0)
                        {
                            iscontained = true;
                            break;
                        }
                    }
 
                    if (!iscontained) {
                        //只获取本地路径
                        newUrls.Add(urls[key]);
                    }
                }
            }
 
            return newUrls;
        }
 
        /// <summary>
        /// 获取所有html元素的src属性值,只获取站点本地的链接,站外的不获取
        /// </summary>
        /// <param name="html">页面的html源码</param>
        /// <returns></returns>
        public static List<UrlModel> GetLocalSrcs(string url,string html)
        {
            if (string.IsNullOrEmpty(html))
                return new List<UrlModel>();
 
            Dictionary<string, UrlModel> urls = GetSrc(url, html);
            List<UrlModel> newUrls = new List<UrlModel>();
 
            if (null != urls)
            {
                foreach (string key in urls.Keys)
                {
                    string newkey = key.ToLower();
                    bool iscontained = false;
                    foreach (var exkey in excludekeys)
                    {
                        if (newkey.IndexOf(exkey) == 0)
                        {
                            iscontained = true;
                            break;
                        }
                    }
 
                    if (!iscontained)
                    {
                        //只获取本地路径
                        newUrls.Add(urls[key]);
                    }
                }
            }
 
            return newUrls;
        }
 
        private static Dictionary<string, UrlModel> GetHrefs(string url,string html)
        {
            if (string.IsNullOrEmpty(html))
                return null;
 
            UrlModel currUrl = UrlParser.Parse(url);
            Dictionary<string, UrlModel> urls = new Dictionary<string, UrlModel>();
            Regex reg = new Regex("href=\"(?<Url>.+?)\"", RegexOptions.IgnoreCase);
             
            if (currUrl != null)
            {
                AddUrlModel(html, currUrl, urls, reg);
            }
 
            return urls;
        }
 
        private static Dictionary<string, UrlModel> GetSrc(string url,string html)
        {
            if (string.IsNullOrEmpty(html))
                return null;
 
            UrlModel currUrl = UrlParser.Parse(url);
            Dictionary<string, UrlModel> urls = new Dictionary<string, UrlModel>();
            Regex reg = new Regex("(src=\"(?<Url>.+?)\"|url\\((?<Url>.+?)\\))", RegexOptions.IgnoreCase);
 
            if (currUrl != null)
            {
                AddUrlModel(html, currUrl, urls, reg);
            }
 
            return urls;
        }
 
        private static void AddUrlModel(string html, UrlModel currUrl, Dictionary<string, UrlModel> urls, Regex reg)
        {
            if (reg.IsMatch(html))
            {
                MatchCollection matchs = reg.Matches(html);
                foreach (Match item in matchs)
                {
                    try
                    {
                        string strUrl = item.Groups["Url"].Value;
                        UrlModel model = new UrlModel();
                        model.RelatedPath = strUrl;
                        model.CurrPath = currUrl.CurrPath;
                        model.RootPath = currUrl.RootPath;
                        model.Scheme = currUrl.Scheme;
                        model.Port = currUrl.Port;
                        model.Host = currUrl.Host;
 
                        if (strUrl.StartsWith("/"))
                        {
                            //绝对目录情况下
                            model.AbsoluteUri = string.Format("{0}{1}", model.RootPath, model.RelatedPath);
                        }
                        else
                        {
                            //相对目录情况下
                            string currPath = model.CurrPath;
                            int depth = 0;
                            string path = model.RelatedPath;
 
                            if (path.StartsWith(".."))
                            {
                                try
                                {
                                    while (path.StartsWith(".."))
                                    {
                                        depth++;
                                        path = path.Substring(3);
                                        currPath = currPath.Substring(0, currPath.LastIndexOf("/"));
                                    }
 
                                    model.AbsoluteUri = string.Format("{0}/{1}", currPath, path);
                                }
                                catch
                                {
 
                                }
                            }
                            else
                            {
                                model.AbsoluteUri = string.Format("{0}/{1}", currPath, path);
                            }
 
                        }
 
                        strUrl = strUrl.Trim().ToLower();
 
                        urls.Add(strUrl, model);
                    }
                    catch
                    {
                    }
                }
            }
        }
    }

4.网页源码扒取类

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
public class HttpTool
    {
        public static string HttpGet(string url, string referer, string encoding, out string msg)
        {
            msg = string.Empty;
            string result = string.Empty;
            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
 
                //request.ContentType = "application/x-www-form-urlencoded";
                request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
                request.Referer = referer;
                request.Method = "GET";
                request.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36";
                //request.Headers.Add("Accept-Language", "zh-cn");
                //request.Headers.Add("Accept-Encoding", "gzip,deflate");
 
                request.Timeout = 60000;//一分钟
 
                HttpWebResponse response = (HttpWebResponse)request.GetResponse();
                Stream responseStream = response.GetResponseStream();
                if (responseStream != null)
                {
                    StreamReader reader = new StreamReader(responseStream, System.Text.Encoding.GetEncoding(encoding));
                    result = reader.ReadToEnd();
                    reader.Close();
                    responseStream.Close();
                    request.Abort();
                    response.Close();
                    return result.Trim();
                }
            }
            catch (Exception ex)
            {
                msg = ex.Message + ex.StackTrace;
            }
 
            return result;
        }
 
        public static void DownFile(string uRLAddress, string localPath, string filename)
        {
            WebClient client = new WebClient();
            Stream str = client.OpenRead(uRLAddress);
            StreamReader reader = new StreamReader(str);
            byte[] mbyte = new byte[1000000];
            int allmybyte = (int)mbyte.Length;
            int startmbyte = 0;
 
            while (allmybyte > 0)
            {
                int m = str.Read(mbyte, startmbyte, allmybyte);
                if (m == 0)
                {
                    break;
                }
                startmbyte += m;
                allmybyte -= m;
            }
 
            reader.Dispose();
            str.Dispose();
 
            string path = Path.Combine(localPath, filename);
            FileStream fstr = new FileStream(path, FileMode.OpenOrCreate, FileAccess.Write);
            fstr.Write(mbyte, 0, startmbyte);
            fstr.Flush();
            fstr.Close();
        }
    }

5.网站克隆主类

接口:

interface IWebCloneWorker
    {
        void Start();
        void Cancel();
    }

 

 

实现类:

复制代码
public class WebCloneWorker : IWebCloneWorker
    {
        //网站页面克隆深度(如:0-首页,1-分类页,2-详细页面)
        public static int depth = 0;
        
        //要克隆的网站网址
        public string Url { get; set; }

        //克隆后,保存的路径
        public string SavePath { get; set; }

        private BackgroundWorker backgroundWorker1 = null;
        public event UrlChangedEventHandler UrlChanged;
        public event FileSavedSuccessEventHandler FileSavedSuccess;
        public event FileSavedFailEventHandler FileSavedFail;
        public event DownloadCompletedEventHandler DownloadCompleted;
        public event CollectingUrlEventHandler CollectingUrl;
        public event CollectedUrlEventHandler CollectedUrl;
        public event ProgressChangedEventHandler ProgressChanged;

        //所有页面、文件资源地址集合
        private Dictionary<string, UrlModel> _Hrefs = new Dictionary<string, UrlModel>();

        /// <summary>
        /// 所有页面、文件资源地址集合
        /// </summary>
        public Dictionary<string,UrlModel> Hrefs
        {
            get { return _Hrefs; }
            set { _Hrefs = value; }
        }

        //网站页面请求编码,默认为UTF-8
        private string _Encoding = "utf-8";

        //网站页面请求编码,默认为UTF-8
        public string Encoding
        {
            get { return _Encoding; }
            set { _Encoding = value; }
        }

        public WebCloneWorker() { }

        public WebCloneWorker(string url,string path) 
        {
            //设置网站、保存路径
            this.Url = url;
            this.SavePath = path;

            if (string.IsNullOrEmpty(this.Url))
                throw new Exception("请输入网址");

            if (string.IsNullOrEmpty(this.SavePath))
                throw new Exception("请选择要保存的目录");

            backgroundWorker1 = new BackgroundWorker();

            //设置报告进度更新
            backgroundWorker1.WorkerReportsProgress = true;
            backgroundWorker1.WorkerSupportsCancellation = true;

            //注册线程主体方法
            backgroundWorker1.DoWork += backgroundWorker1_DoWork;

            //注册更新UI方法
            backgroundWorker1.ProgressChanged += backgroundWorker1_ProgressChanged;

            //处理完毕
            backgroundWorker1.RunWorkerCompleted += backgroundWorker1_RunWorkerCompleted;
        }

        void backgroundWorker1_RunWorkerCompleted(object sender, RunWorkerCompletedEventArgs e)
        {
            if (e.Cancelled) {
                return;
            }

            if (this.DownloadCompleted != null)
            {
                DownloadCompletedEventArgs eventArgs = new DownloadCompletedEventArgs(e.Result, e.Error, e.Cancelled);
                this.DownloadCompleted(this, eventArgs);
            }
        }

        void backgroundWorker1_ProgressChanged(object sender, ProgressChangedEventArgs e)
        {
            //进度回调
            if (this.ProgressChanged != null) 
                this.ProgressChanged(this, e);

            UrlModel model = (UrlModel)e.UserState;

            if (this.UrlChanged != null)
            {
                //Url改变后,回调
                UrlChangedEventArgs eventArgs = new UrlChangedEventArgs(model);
                this.UrlChanged(this, eventArgs);
            }

            try
            {
                string dir = this.SavePath;
                string url = model.AbsoluteUri;
                string AbsolutePath = url.Substring(url.IndexOf('/', 8));
                string fileName = "";

                if (url.IndexOf('?') > 0)
                {
                    string path = AbsolutePath.Substring(0, model.RelatedPath.IndexOf('?'));
                    fileName = System.IO.Path.GetFileName(path);
                }
                else
                {
                    fileName = System.IO.Path.GetFileName(AbsolutePath);
                }

                //默认首页
                if (string.IsNullOrEmpty(fileName) || fileName.IndexOf(".") < 0)
                {
                    fileName = "index.html";

                    if (!AbsolutePath.EndsWith("/"))
                        AbsolutePath = AbsolutePath + "/";
                }

                fileName = System.Web.HttpUtility.UrlDecode(fileName);

                string localPath = string.Format("{0}{1}", dir, System.IO.Path.GetDirectoryName(AbsolutePath));
                if (!System.IO.Directory.Exists(localPath))
                {
                    System.IO.Directory.CreateDirectory(localPath);
                }

                //判断文件是否存在,存在不再下载
                string path2 = Path.Combine(localPath, fileName);
                if (File.Exists(path2))
                {
                    return;
                }

                //下载网页、图片、资源文件
                HttpTool.DownFile(url, localPath, fileName);

                //保存成功后,回调
                if (this.FileSavedSuccess != null)
                {
                    FileSavedSuccessEventArgs eventArgs = new FileSavedSuccessEventArgs(model);
                    this.FileSavedSuccess(this, eventArgs);
                }
            }
            catch (Exception ex)
            {
                //保存失败后,回调
                if (this.FileSavedFail != null)
                {
                    FileSavedFailEventArgs eventArgs = new FileSavedFailEventArgs(ex);
                    this.FileSavedFail(this, eventArgs);
                }
            }
        }

        void backgroundWorker1_DoWork(object sender, DoWorkEventArgs e)
        {
            //获取资源
            GetResource();

            int index = 1;
            if (this.Hrefs.Keys.Count > 0)
            {
                foreach (var k in this.Hrefs.Keys)
                {
                    //取消操作
                    if (backgroundWorker1.CancellationPending)
                    {
                        e.Cancel = true;
                        return;
                    }

                    backgroundWorker1.ReportProgress(index, this.Hrefs[k]);
                    index++;

                    //挂起当前线程200毫秒
                    Thread.Sleep(200);
                }
            }
        }

        public void Start()
        {
            if (this.backgroundWorker1.IsBusy)
                return;

            this.backgroundWorker1.RunWorkerAsync();
        }

        public void Cancel()
        {
            if (this.backgroundWorker1.CancellationPending)
                return;

            this.backgroundWorker1.CancelAsync();
        }
        
        private void GetResource()
        {
            string url = this.Url;
            string referer = this.Url;
            string msg = "";
            string html = HttpTool.HttpGet(url, referer, this.Encoding, out msg);

            //收集页面链接
            GetHrefs(0, url, html);

            //收集完毕
            if (null != CollectedUrl)
            {
                UrlModel urlModel = new UrlModel();
                CollectedUrlEventArgs eventArgs = new CollectedUrlEventArgs(urlModel);
                this.CollectedUrl(this, eventArgs);
            }

        }

        private void GetHrefs(int level,string url,string html)
        {
            #region 添加当前页

            UrlModel currUrl = UrlParser.Parse(url);

            try
            {
                //取消
                if (backgroundWorker1.CancellationPending)
                    return;

                this.Hrefs.Add(currUrl.RelatedPath, currUrl);

                //收集回调
                if (null != CollectingUrl)
                {
                    CollectingUrlEventArgs eventArgs = new CollectingUrlEventArgs(currUrl);
                    this.CollectingUrl(this, eventArgs);
                }
            }
            catch
            {
            }

            #endregion

            //获取相关链接(含有href属性的)
            List<UrlModel> list1 = WebPageService.GetLocalHrefs(url,html);

            //获取图片,文件等资源文件(含有src属性的)
            List<UrlModel> listSrcs = WebPageService.GetLocalSrcs(url,html);

            #region 获取当级资源文件

            if (listSrcs != null)
            {
                for (int i = 0; i < listSrcs.Count; i++)
                {
                    UrlModel urlModel = listSrcs[i];
                    try
                    {
                        //取消
                        if (backgroundWorker1.CancellationPending) 
                            return;

                        this.Hrefs.Add(urlModel.RelatedPath, urlModel);

                        //收集回调
                        if (null != CollectingUrl)
                        {
                            CollectingUrlEventArgs eventArgs = new CollectingUrlEventArgs(urlModel);
                            this.CollectingUrl(this, eventArgs);
                        }
                    }
                    catch
                    { }
                }
            }

            #endregion

            #region 获取子级页面资源

            //获取第二级
            if (list1 != null)
            {
                for (int i = 0; i < list1.Count; i++)
                {
                    UrlModel urlModel = list1[i];

                    try
                    {
                        //取消
                        if (backgroundWorker1.CancellationPending)
                            return;

                        this.Hrefs.Add(urlModel.RelatedPath, urlModel);

                        //收集回调
                        if (null != CollectingUrl)
                        {
                            CollectingUrlEventArgs eventArgs = new CollectingUrlEventArgs(urlModel);
                            this.CollectingUrl(this, eventArgs);
                        }
                    }
                    catch
                    { }

                    string msg = "";
                    html = HttpTool.HttpGet(urlModel.AbsoluteUri, urlModel.AbsoluteUri, this.Encoding, out msg);

                    #region 获取子级资源文件

                    /*
                     * 获取二级资源文件
                     * */
                    listSrcs = WebPageService.GetLocalSrcs(urlModel.AbsoluteUri, html);//资源文件

                    if (listSrcs != null)
                    {
                        for (int j = 0; j < listSrcs.Count; j++)
                        {
                            UrlModel urlModel2 = listSrcs[j];

                            try
                            {
                                //取消
                                if (backgroundWorker1.CancellationPending)
                                    return;

                                this.Hrefs.Add(urlModel2.RelatedPath, urlModel2);

                                //收集回调
                                if (null != CollectingUrl)
                                {
                                    CollectingUrlEventArgs eventArgs = new CollectingUrlEventArgs(urlModel2);
                                    this.CollectingUrl(this, eventArgs);
                                }
                            }
                            catch
                            { }

                            //挂起线程20毫秒
                            Thread.Sleep(20);
                        }
                    }
                    #endregion

                    //挂起线程20毫秒
                    Thread.Sleep(20);

                    //到达指定深度后,退出
                    if (level >= depth)
                        return;

                    //递归
                    GetHrefs(level + 1, urlModel.AbsoluteUri, html);
                }
            }

            #endregion
        }
    }
复制代码

 

6.一些事件、委托类:

public delegate void UrlChangedEventHandler(object sender, UrlChangedEventArgs e);
    public delegate void FileSavedSuccessEventHandler(object sender, FileSavedSuccessEventArgs e);
    public delegate void FileSavedFailEventHandler(object sender, FileSavedFailEventArgs e);
    public delegate void DownloadCompletedEventHandler(object sender, DownloadCompletedEventArgs e);
    public delegate void CollectingUrlEventHandler(object sender, CollectingUrlEventArgs e);
    public delegate void CollectedUrlEventHandler(object sender, CollectedUrlEventArgs e);
    public delegate void ProgressChangedEventHandler(object sender, ProgressChangedEventArgs e);

 

public class CollectedUrlEventArgs : EventArgs
public class CollectingUrlEventArgs : EventArgs
public class DownloadCompletedEventArgs : EventArgs
public class FileSavedFailEventArgs : EventArgs
public class FileSavedSuccessEventArgs : EventArgs
public class UrlChangedEventArgs : EventArgs

 

代码有点多,各位有需要的还是下载源码查看并运行吧,由于赶时间,没时间仔细测试程序的各个功能,难免有不足的地方。

百度网盘:链接:https://pan.baidu.com/s/1hja1rl9UEcl0dzTqVFt0dg 密码:关注公众号“算你牛”回复“网站克隆”获取密码。

 

posted @   jonlan  阅读(3013)  评论(11编辑  收藏  举报
编辑推荐:
· Linux系列:如何用 C#调用 C方法造成内存泄露
· AI与.NET技术实操系列(二):开始使用ML.NET
· 记一次.NET内存居高不下排查解决与启示
· 探究高空视频全景AR技术的实现原理
· 理解Rust引用及其生命周期标识(上)
阅读排行:
· 阿里最新开源QwQ-32B,效果媲美deepseek-r1满血版,部署成本又又又降低了!
· 单线程的Redis速度为什么快?
· SQL Server 2025 AI相关能力初探
· 展开说说关于C#中ORM框架的用法!
· AI编程工具终极对决:字节Trae VS Cursor,谁才是开发者新宠?
多写原创文章,多分享,帮助他人,快乐自己!
点击右上角即可分享
微信分享提示