获取标题
Sub GetCatalogPages() For n = 1 To 20 CatalogURL = "http://blog.sina.com.cn/s/_" & n & ".html" Call GetCatalogByUrl(CatalogURL) Next n End Sub Sub GetCatalogByUrl(ByVal CatalogURL As String) 'Dim CatalogURL As String Dim WebText As String Dim OneSpan As Object Dim OneA As Object Dim Wb As Workbook Dim Sht As Worksheet Dim Rng As Range Dim i As Long, j As Long Dim StartTime As Variant '开始时间 Dim UsedTime As Variant '使用时间 StartTime = VBA.Timer '记录开始时间 AppSettings Set Wb = Application.ThisWorkbook Set Sht = Wb.Worksheets("Catalog") With Sht '.UsedRange.Offset(1).ClearContents 'i = 1 endrow = .Cells(.Cells.Rows.Count, 1).End(xlUp).Row i = endrow '发送请求 With CreateObject("MSXML2.XMLHTTP") .Open "GET", CatalogURL, False .Send WebText = .responsetext End With '创建网页文件 创建 Html Dom 'Microsoft HTML Object Library With CreateObject("htmlfile") .write WebText For Each OneA In .getElementsByTagName("a") href = OneA.href If href Like "*http://blog.sina.com.cn/s/blog_*" Then i = i + 1 Sht.Cells(i, 2).Value = href ' Sht.Hyperlinks.Add Sht.Cells(i, 2), href ', href End If Next OneA i = endrow For Each OneMeta In .getElementsByTagName("meta") If OneMeta.Name = "description" Then cnt = OneMeta.Content 'Debug.Print cnt titles = Split(Split(cnt, "xxxx,")(1), ",") For n = LBound(titles) To UBound(titles) Step 1 i = i + 1 Sht.Cells(i, 1).Value = titles(n) Next n End If Next OneMeta End With End With AppSettings False UsedTime = VBA.Timer - StartTime Debug.Print "采集 " & CatalogURL; " : " & Format(UsedTime, "#0.0000秒") 'MsgBox "本次运行耗时:" & Format(UsedTime, "#0.0000秒") End Sub Sub GetQuestionsByExamUrl() Dim Wb As Workbook Dim Sht As Worksheet Dim oSht As Worksheet Set Wb = Application.ThisWorkbook Set Sht = Wb.Worksheets("Catalog") Set oSht = Wb.Worksheets("Question") With Sht endrow = .Cells(.Cells.Rows.Count, 1).End(xlUp).Row Set Rng = .Range("A2:B" & endrow) Arr = Rng.Value End With With oSht r = 1 For i = LBound(Arr) To UBound(Arr) ExamTitle = Arr(i, 1) ExamUrl = Arr(i, 2) ExamText = GetExamTextByUrl(ExamUrl) Ques = RegGetArray(ExamText, "([\((]\d[\))][^\r\n]*)[\r\n]") For n = LBound(Ques) To UBound(Ques) Step 1 r = r + 1 .Cells(r, 1).Value = ExamTitle .Cells(r, 2).Value = ExamUrl .Cells(r, 3).Value = Ques(n) Next n Next i End With Set Wb = Nothing Set Sht = Nothing Set oSht = Nothing End Sub Function GetExamTextByUrl(ByVal ExamUrl As String) As String '发送请求 With CreateObject("MSXML2.XMLHTTP") .Open "GET", ExamUrl, False .Send WebText = .responsetext 'Debug.Print WebText End With With CreateObject("htmlfile") .write WebText Set examdiv = .getElementById("sina_keyword_ad_area2") ' Debug.Print examdiv.innerText GetExamTextByUrl = examdiv.innerText End With End Function Private Sub AppSettings(Optional IsStart As Boolean = True) If IsStart Then Application.ScreenUpdating = False Application.DisplayAlerts = False Application.Calculation = xlCalculationManual Application.StatusBar = ">>>>>>>>Macro Is Running>>>>>>>>" Else Application.ScreenUpdating = True Application.DisplayAlerts = True Application.Calculation = xlCalculationAutomatic Application.StatusBar = False End If End Sub Public Function RegGetArray(ByVal OrgText As String, ByVal Pattern As String) As String() Dim Reg As Object, Mh As Object, OneMh As Object Dim Arr() As String, Index As Long Dim Elm As String Set Reg = CreateObject("Vbscript.Regexp") With Reg .MultiLine = True .Global = True .Ignorecase = False .Pattern = Pattern Set Mh = .Execute(OrgText) Index = 0 ReDim Arr(1 To 1) For Each OneMh In Mh Index = Index + 1 ReDim Preserve Arr(1 To Index) 'If OneMh.submatches(0) <> "" Then Elm = OneMh.submatches(0) Arr(Index) = OneMh.submatches(0) 'Debug.Print OneMh.submatches(0) Next OneMh End With RegGetArray = Arr Set Reg = Nothing Set Mh = Nothing End Function Public Function RegGet(ByVal OrgText As String, ByVal Pattern As String) As String '传递参数 :原字符串, 匹配模式 Dim Regex As Object Dim Mh As Object Set Regex = CreateObject("VBScript.RegExp") With Regex .Global = True .Pattern = Pattern End With If Regex.test(OrgText) Then Set Mh = Regex.Execute(OrgText) RegGet = Mh.Item(0).submatches(0) Else RegGet = "" End If Set Regex = Nothing End Function