Wechat.Crawler/App/App.csproj
<Project Sdk="Microsoft.NET.Sdk">

  <!-- Build settings: console executable targeting .NET 8 with implicit usings and nullable reference types. -->
  <PropertyGroup>
    <OutputType>Exe</OutputType>
    <TargetFramework>net8.0</TargetFramework>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
  </PropertyGroup>

  <!-- Project that provides the EF Core context and entities. -->
  <ItemGroup>
    <ProjectReference Include="..\Blog\Blog.csproj" />
  </ItemGroup>

  <!-- The NLog configuration file is copied next to the built binaries on every build. -->
  <ItemGroup>
    <None Update="nlog.config">
      <CopyToOutputDirectory>Always</CopyToOutputDirectory>
    </None>
  </ItemGroup>

  <!-- NuGet dependencies: HTML parsing, configuration/DI, logging and Selenium browser automation. -->
  <ItemGroup>
    <PackageReference Include="AngleSharp" Version="1.2.0-beta.410" />
    <PackageReference Include="HtmlAgilityPack" Version="1.11.60" />
    <PackageReference Include="Microsoft.Extensions.Configuration.Json" Version="8.0.0" />
    <PackageReference Include="Microsoft.Extensions.DependencyInjection" Version="8.0.0" />
    <PackageReference Include="NLog.Extensions.Logging" Version="5.3.8" />
    <PackageReference Include="Selenium.Support" Version="4.18.1" />
    <PackageReference Include="Selenium.WebDriver" Version="4.18.1" />
  </ItemGroup>

</Project>
Wechat.Crawler/App/Program.cs
using System.Globalization;
using System.Runtime.InteropServices;
using System.Text.RegularExpressions;
using AngleSharp.Html.Dom;
using AngleSharp.Html.Parser;
using HtmlAgilityPack;
using Microsoft.Extensions.Logging;
using NLog.Config;
using NLog.Extensions.Logging;
using OpenQA.Selenium;
using OpenQA.Selenium.Edge;
using OpenQA.Selenium.Support.UI;
using Wechat.Crawler.Blog;

/// <summary>
/// Console entry point of the crawler. It can extract article links from text files,
/// download article pages with a headless Edge browser, derive account rows from the
/// stored articles, and convert the downloaded HTML pages to plain text.
/// </summary>
internal class Program
{
    /// <summary>Root folder that receives the downloaded pages (one sub-folder per biz value).</summary>
    const string WechatFolder = "/Users/song/Code/Wechat.Crawler/Out";

    /// <summary>
    /// Runs one pipeline step. Only the download step is active; the other steps are
    /// kept as commented-out calls so they can be switched on by hand.
    /// </summary>
    private static async Task Main(string[] args)
    {
        await DownLoad();
        // ExtractLinks();
        // ExtractWechatAccounts();
        // Convert the downloaded html files to txt files.
        // ConvertHtmlToTxt();
    }

    /// <summary>
    /// Writes a <c>.txt</c> file next to every <c>.html</c> file found (recursively) under
    /// <see cref="WechatFolder"/>. Pages from which no title, publish time or body could be
    /// extracted are skipped.
    /// </summary>
    private static void ConvertHtmlToTxt()
    {
        foreach (string file in Directory.GetFiles(WechatFolder, "*.html", SearchOption.AllDirectories))
        {
            var folderPath = Path.GetDirectoryName(file);
            var fileName = Path.GetFileNameWithoutExtension(file);
            var txtFilePath = Path.Combine(folderPath!, fileName + ".txt");

            var articleContent = ExtractTxtContent(file);

            // All three lookups failed: there is nothing worth writing.
            if (articleContent == "No Content\nNo Content\nNo Content")
            {
                continue;
            }

            using var writer = new StreamWriter(txtFilePath);
            writer.WriteLine(articleContent);
        }
    }

    /// <summary>
    /// Loads an HTML file and returns its title, publish time and body text joined by newlines.
    /// </summary>
    /// <param name="filePath">Path of the HTML file to read.</param>
    /// <returns>
    /// Three newline-separated parts; a part is <c>"No Content"</c> when its element is missing.
    /// </returns>
    private static string ExtractTxtContent(string filePath)
    {
        var htmlDoc = new HtmlDocument
        {
            // Let HtmlAgilityPack repair badly nested tags while parsing.
            OptionFixNestedTags = true,
        };
        htmlDoc.Load(filePath);

        var header = GetContentById(htmlDoc, "activity-name");
        var publishTime = GetContentById(htmlDoc, "publish_time");
        var content = GetContentById(htmlDoc, "js_content");

        return header + "\n" + publishTime + "\n" + content;
    }

    /// <summary>
    /// Returns the inner text of the element with the given id, with spaces removed and
    /// line breaks inserted around a few punctuation and numbering markers.
    /// </summary>
    /// <param name="htmlDoc">Parsed document to search.</param>
    /// <param name="id">Value of the <c>id</c> attribute to look up.</param>
    /// <returns>The cleaned text, or <c>"No Content"</c> when no such element exists.</returns>
    private static string GetContentById(HtmlDocument htmlDoc, string id)
    {
        HtmlNode? contentNode = htmlDoc.GetElementbyId(id);
        if (contentNode is null)
        {
            Console.WriteLine($"Element with id={id} not found.");
            return "No Content";
        }

        return contentNode.InnerText.Trim()
            .Replace(" ", "")
            .Replace("?", "?\n")
            // .Replace("!","!\n")
            .Replace("1.", "\n1.")
            .Replace("1、", "\n1、")
            .Replace("。", "。\n");
    }

    /// <summary>
    /// Prints and returns the inner text of every <c>span</c> whose class attribute contains
    /// <paramref name="className"/>.
    /// </summary>
    /// <param name="htmlDoc">Parsed document to search.</param>
    /// <param name="className">Class-name fragment; it is embedded in an XPath string literal as-is.</param>
    /// <returns>
    /// The matched texts joined by newlines, or <c>"No Content"</c> when nothing matched.
    /// </returns>
    private static string GetContentByClass(HtmlDocument htmlDoc, string className)
    {
        var nodesWithClass = htmlDoc.DocumentNode.SelectNodes($"//span[contains(@class, '{className}')]");

        // SelectNodes returns null (not an empty collection) when nothing matches.
        if (nodesWithClass is null || nodesWithClass.Count == 0)
        {
            Console.WriteLine("No elements with the specified class name were found.");
            return "No Content";
        }

        var texts = new List<string>(nodesWithClass.Count);
        foreach (var node in nodesWithClass)
        {
            Console.WriteLine(node.InnerText);
            texts.Add(node.InnerText);
        }

        return string.Join("\n", texts);
    }

    /// <summary>
    /// Downloads the page of a not-yet-downloaded article, stores it under
    /// <see cref="WechatFolder"/> and marks the article as downloaded.
    /// </summary>
    private static async Task DownLoad()
    {
        // Dispose the context even when a download or a file write throws.
        using var db = new WechatDbContext();
        db.Database.EnsureCreated();

        var articles = db.Articles
            .Where(t => t.IsDownload == false)
            .OrderBy(t => t.Biz)
            .ToList();

        foreach (var article in articles)
        {
            // Biz and Url are nullable columns; a row without them cannot be downloaded.
            if (string.IsNullOrEmpty(article.Biz) || string.IsNullOrEmpty(article.Url))
            {
                Console.WriteLine($"Skipping article Mid = {article.Mid}: Biz or Url is missing.");
                continue;
            }

            var folderPath = Path.Combine(WechatFolder, article.Biz.Trim('='));
            if (!Directory.Exists(folderPath))
            {
                Directory.CreateDirectory(folderPath);
                Console.WriteLine("文件夹已创建: " + folderPath);
            }

            var (fileName, content) = await DownLoadPageAsync(article.Url);
            var filePath = Path.Combine(folderPath, $"{article.Mid}_{fileName}.html");
            await File.WriteAllTextAsync(filePath, content);

            article.IsDownload = true;
            await db.SaveChangesAsync();

            // NOTE(review): the loop stops after the first downloaded article, so each run
            // fetches at most one page - confirm this is intended and not a debugging leftover.
            break;
        }
    }

    /// <summary>
    /// Opens <paramref name="url"/> in a headless Edge browser and returns the page source
    /// together with a file name derived from the <c>#activity-name</c> element.
    /// </summary>
    /// <param name="url">Address of the page to fetch.</param>
    /// <returns>
    /// The sanitized title (or <c>"默认名称"</c> when no usable title exists) and the page HTML.
    /// </returns>
    private static async Task<(string fileName, string html)> DownLoadPageAsync(string url)
    {
        var options = new EdgeOptions();
        options.AddArgument("--headless=new");

        IWebDriver driver = new EdgeDriver(options);
        try
        {
            driver.Navigate().GoToUrl(url);
            driver.Manage().Timeouts().ImplicitWait = TimeSpan.FromMilliseconds(5000);

            // Fixed pause before the page source is read.
            await Task.Delay(3000);
            var html = driver.PageSource;

            var parser = new HtmlParser();
            var document = await parser.ParseDocumentAsync(html);

            var fileName = "默认名称";
            var titleElement = document.QuerySelector("#activity-name");
            if (titleElement is not null)
            {
                var candidate = ReplaceInvalidFileNameChars(titleElement.TextContent.Trim());

                // Keep the default name when the title is empty or consisted only of invalid characters.
                if (candidate.Length > 0)
                {
                    fileName = candidate;
                }
            }

            return (fileName, html);
        }
        finally
        {
            // Always end the browser session, including when navigation or parsing throws.
            driver.Quit();
        }
    }

    /// <summary>
    /// Replaces every character that is invalid in a file name on the current platform
    /// with an underscore and strips leading and trailing underscores.
    /// </summary>
    /// <param name="fileName">Candidate file name.</param>
    /// <returns>The sanitized name; it may be empty.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="fileName"/> is null.</exception>
    public static string ReplaceInvalidFileNameChars(string fileName)
    {
        ArgumentNullException.ThrowIfNull(fileName);

        // Splitting on the invalid characters and re-joining with '_' replaces each of them
        // one-for-one without building a regular expression on every call.
        string replaced = string.Join("_", fileName.Split(Path.GetInvalidFileNameChars()));
        return replaced.Trim('_');
    }

    /// <summary>
    /// Ensures there is one account row per distinct biz value found in the stored articles.
    /// Existing rows get their name set to the biz value; missing rows are created.
    /// </summary>
    private static void ExtractWechatAccounts()
    {
        using var db = new WechatDbContext();
        db.Database.EnsureCreated();

        var allBiz = db.Articles.Select(a => a.Biz).Distinct().ToList();

        foreach (var biz in allBiz)
        {
            // Articles without a biz value cannot be mapped to an account.
            if (string.IsNullOrEmpty(biz))
            {
                continue;
            }

            Console.WriteLine(biz);

            var account = db.WechatAccounts.FirstOrDefault(existing => existing.Biz == biz);
            if (account is not null)
            {
                account.Name = biz;
            }
            else
            {
                db.WechatAccounts.Add(new WechatAccount { Biz = biz, Name = biz, EndDate = DateTime.Now });
            }
        }

        db.SaveChanges();
    }

    /// <summary>
    /// Scans every <c>.txt</c> file in the links folder for article URLs and stores each URL
    /// whose <c>mid</c> is not yet present as a new article row.
    /// </summary>
    private static void ExtractLinks()
    {
        using var loggerFactory = LoggerFactory.Create(builder => builder.AddNLog());
        var logger = loggerFactory.CreateLogger<Program>();

        using var db = new WechatDbContext();
        db.Database.EnsureCreated();

        const string directoryPath = @"/Users/song/Code/Wechat.Crawler/Links";

        // NOTE(review): ".+" is greedy, so a line holding several links yields one long match
        // that the length filter below rejects - confirm whether a lazy match is wanted.
        const string linkPattern = @"http:\/\/(mp.weixin.qq.com\/s\?__biz=).+(wechat_redirect)";
        const string paramPattern = @"(?<key>\w+)=(?<value>[^&]+)";

        foreach (string filePath in Directory.GetFiles(directoryPath, "*.txt"))
        {
            foreach (string line in File.ReadLines(filePath))
            {
                foreach (Match match in Regex.Matches(line, linkPattern))
                {
                    var url = match.Value;

                    // Only matches whose length is between 210 and 220 characters are accepted.
                    if (url.Length < 210 || url.Length > 220)
                    {
                        continue;
                    }

                    string biz = string.Empty;
                    long mid = 0;
                    foreach (Match parameter in Regex.Matches(url, paramPattern))
                    {
                        string key = parameter.Groups["key"].Value;
                        string value = parameter.Groups["value"].Value;

                        if (key == "__biz")
                        {
                            biz = value;
                        }
                        else if (key == "mid"
                            && long.TryParse(value, NumberStyles.Integer, CultureInfo.InvariantCulture, out var parsedMid))
                        {
                            // A non-numeric mid leaves the value at 0, which is rejected below.
                            mid = parsedMid;
                        }
                    }

                    if (biz.Length == 0 || mid == 0)
                    {
                        logger.LogError("该http不是有效的链接");

                        // Do not store a link that lacks a biz or a mid.
                        continue;
                    }

                    var existingArticle = db.Articles.FirstOrDefault(a => a.Mid == mid);
                    if (existingArticle is null)
                    {
                        db.Articles.Add(new Article { Biz = biz, Mid = mid, Url = url });

                        // Saved immediately so the duplicate check above sees this row next time.
                        db.SaveChanges();
                        logger.LogInformation("Mid = {Mid} 添加成功.", mid);
                    }
                    else
                    {
                        logger.LogWarning("Mid = {Mid} already exists.", mid);
                    }
                }
            }
        }
    }
}
Wechat.Crawler/Topic/Topic.csproj
<Project Sdk="Microsoft.NET.Sdk">

  <!-- Build settings: targets .NET 8 with implicit usings and nullable reference types enabled. -->
  <PropertyGroup>
    <TargetFramework>net8.0</TargetFramework>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
  </PropertyGroup>

</Project>
Wechat.Crawler/Topic/Class1.cs
namespace Topic;

/// <summary>
/// Empty placeholder type; it declares no members.
/// </summary>
public class Class1
{
}
Wechat.Crawler/Blog/Blog.csproj
<Project Sdk="Microsoft.NET.Sdk">

  <!-- Build settings: targets .NET 8 with implicit usings and nullable reference types enabled. -->
  <PropertyGroup>
    <TargetFramework>net8.0</TargetFramework>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
  </PropertyGroup>

  <!-- SQLite and Entity Framework Core packages; the tools package is a private asset of this project. -->
  <ItemGroup>
    <PackageReference Include="Microsoft.Data.Sqlite.Core" Version="8.0.3" />
    <PackageReference Include="Microsoft.EntityFrameworkCore.Sqlite" Version="8.0.3" />
    <PackageReference Include="Microsoft.entityframeworkcore.tools"
                      Version="8.0.3"
                      IncludeAssets="runtime; build; native; contentfiles; analyzers; buildtransitive"
                      PrivateAssets="all" />
  </ItemGroup>

</Project>
Wechat.Crawler/Blog/WechatDbContext.cs
using Microsoft.EntityFrameworkCore;

namespace Wechat.Crawler.Blog;

/// <summary>
/// EF Core context exposing the article and account sets, backed by a SQLite file
/// unless the options were already configured.
/// </summary>
public class WechatDbContext : DbContext
{
    /// <summary>Articles, stored in the "Article" table.</summary>
    public DbSet<Article> Articles { get; set; }

    /// <summary>Accounts, stored in the "Wechat" table.</summary>
    public DbSet<WechatAccount> WechatAccounts { get; set; }

    /// <summary>
    /// Falls back to the local SQLite database when no provider has been configured yet.
    /// </summary>
    protected override void OnConfiguring(DbContextOptionsBuilder optionsBuilder)
    {
        if (optionsBuilder.IsConfigured)
        {
            return;
        }

        // To use a different database, only this connection string has to change.
        optionsBuilder.UseSqlite(@"Data Source=/Users/song/Code/Wechat.Crawler/Blog/Db/01.db");
    }

    /// <summary>
    /// Maps both entities to their tables and declares their unique indexes.
    /// </summary>
    protected override void OnModelCreating(ModelBuilder modelBuilder)
    {
        modelBuilder.Entity<Article>(article =>
        {
            article.ToTable("Article");
            article.HasIndex(a => a.Mid).IsUnique(); // mid is unique
        });

        modelBuilder.Entity<WechatAccount>(account =>
        {
            account.ToTable("Wechat");
            account.HasIndex(a => a.Biz).IsUnique(); // biz is unique
        });
    }
}
Wechat.Crawler/Blog/Entities/Article.cs
using System;
using System.Collections.Generic;
using System.Numerics;
using System.Text;

namespace Wechat.Crawler.Blog;

/// <summary>
/// A crawled article link together with its processing flags.
/// </summary>
public class Article
{
    /// <summary>
    /// Identifier of the row. Initialized with a freshly generated value;
    /// <c>new Guid()</c> would yield the all-zero <see cref="Guid.Empty"/> for every instance.
    /// </summary>
    public Guid Id { get; set; } = Guid.NewGuid();

    /// <summary>Value of the <c>__biz</c> query parameter of the article URL.</summary>
    public string? Biz { get; set; }

    /// <summary>Value of the <c>mid</c> query parameter of the article URL.</summary>
    public long? Mid { get; set; }

    /// <summary>Whether the article page has already been downloaded.</summary>
    public bool IsDownload { get; set; } = false;

    /// <summary>
    /// NOTE(review): presumably marks that the article was processed by an external
    /// "XunFei" step; nothing in the visible code reads or writes it - confirm.
    /// </summary>
    public bool IsXunFei { get; set; } = false;

    /// <summary>Full URL of the article.</summary>
    public string? Url { get; set; }
}
Wechat.Crawler/Blog/Entities/Wechat.cs
namespace Wechat.Crawler.Blog;

/// <summary>
/// An account identified by its biz value.
/// </summary>
public class WechatAccount
{
    /// <summary>
    /// Identifier of the row. Initialized with a freshly generated value;
    /// <c>new Guid()</c> would yield the all-zero <see cref="Guid.Empty"/> for every instance.
    /// </summary>
    public Guid Id { get; set; } = Guid.NewGuid();

    /// <summary>Biz value that identifies the account.</summary>
    public string? Biz { get; set; }

    /// <summary>Display name of the account.</summary>
    public string? Name { get; set; }

    /// <summary>NOTE(review): presumably the start of the crawled date range - confirm.</summary>
    public DateTime? StartDate { get; set; }

    /// <summary>NOTE(review): presumably the end of the crawled date range - confirm.</summary>
    public DateTime? EndDate { get; set; }
}
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】凌霞软件回馈社区,博客园 & 1Panel & Halo 联合会员上线
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】博客园社区专享云产品让利特惠,阿里云新客6.5折上折
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 微软正式发布.NET 10 Preview 1:开启下一代开发框架新篇章
· 没有源码,如何修改代码逻辑?
· PowerShell开发游戏 · 打蜜蜂
· 在鹅厂做java开发是什么体验
· WPF到Web的无缝过渡:英雄联盟客户端的OpenSilver迁移实战
2023-04-15 heatmapts_simple-heatmap的使用