获取标题

Sub GetCatalogPages()
      For n = 1 To 20
            CatalogURL = "http://blog.sina.com.cn/s/_" & n & ".html"
           Call GetCatalogByUrl(CatalogURL)
      Next n
End Sub
Sub GetCatalogByUrl(ByVal CatalogURL As String)
    'Dim CatalogURL As String
    Dim WebText As String
    Dim OneSpan As Object
    Dim OneA As Object
    Dim Wb As Workbook
    Dim Sht As Worksheet
    Dim Rng As Range
    Dim i As Long, j As Long
    
    Dim StartTime As Variant    '开始时间
    Dim UsedTime As Variant    '使用时间
    StartTime = VBA.Timer    '记录开始时间
    
    AppSettings

    
    Set Wb = Application.ThisWorkbook
    Set Sht = Wb.Worksheets("Catalog")
    With Sht
        '.UsedRange.Offset(1).ClearContents
        'i = 1
      endrow = .Cells(.Cells.Rows.Count, 1).End(xlUp).Row
      i = endrow
        '发送请求
        With CreateObject("MSXML2.XMLHTTP")
            .Open "GET", CatalogURL, False
            .Send
            WebText = .responsetext
        End With
        '创建网页文件 创建 Html Dom
        'Microsoft HTML Object Library
        With CreateObject("htmlfile")
            .write WebText
            For Each OneA In .getElementsByTagName("a")
                href = OneA.href
                If href Like "*http://blog.sina.com.cn/s/blog_*" Then
                    i = i + 1
                    Sht.Cells(i, 2).Value = href
                   '     Sht.Hyperlinks.Add Sht.Cells(i, 2), href ', href
                End If
            Next OneA
             i = endrow
            For Each OneMeta In .getElementsByTagName("meta")
                If OneMeta.Name = "description" Then
                    cnt = OneMeta.Content
                    'Debug.Print cnt
                    titles = Split(Split(cnt, "xxxx,")(1), ",")
                    For n = LBound(titles) To UBound(titles) Step 1
                        i = i + 1
                        Sht.Cells(i, 1).Value = titles(n)
                    Next n
                End If
            Next OneMeta
        End With
    End With
    AppSettings False
    UsedTime = VBA.Timer - StartTime
    Debug.Print "采集     " & CatalogURL; " :  " & Format(UsedTime, "#0.0000秒")
    'MsgBox "本次运行耗时:" & Format(UsedTime, "#0.0000秒")
End Sub
Sub GetQuestionsByExamUrl()
    
    Dim Wb As Workbook
    Dim Sht As Worksheet
    Dim oSht As Worksheet
Set Wb = Application.ThisWorkbook
    
    Set Sht = Wb.Worksheets("Catalog")
    Set oSht = Wb.Worksheets("Question")
    
    With Sht
        endrow = .Cells(.Cells.Rows.Count, 1).End(xlUp).Row
        Set Rng = .Range("A2:B" & endrow)
        Arr = Rng.Value
    End With
    With oSht
        r = 1
        For i = LBound(Arr) To UBound(Arr)
            ExamTitle = Arr(i, 1)
            ExamUrl = Arr(i, 2)
            ExamText = GetExamTextByUrl(ExamUrl)
            Ques = RegGetArray(ExamText, "([\((]\d[\))][^\r\n]*)[\r\n]")
            For n = LBound(Ques) To UBound(Ques) Step 1
                r = r + 1
                .Cells(r, 1).Value = ExamTitle
                .Cells(r, 2).Value = ExamUrl
                .Cells(r, 3).Value = Ques(n)
            Next n
            
        Next i
    End With
    
    
    
    Set Wb = Nothing
    Set Sht = Nothing
    Set oSht = Nothing
    
End Sub

Function GetExamTextByUrl(ByVal ExamUrl As String) As String
       '发送请求
        With CreateObject("MSXML2.XMLHTTP")
            .Open "GET", ExamUrl, False
            .Send
            WebText = .responsetext
            'Debug.Print WebText
        End With
        With CreateObject("htmlfile")
            .write WebText
           Set examdiv = .getElementById("sina_keyword_ad_area2")
           ' Debug.Print examdiv.innerText
          GetExamTextByUrl = examdiv.innerText
        End With
End Function
Private Sub AppSettings(Optional IsStart As Boolean = True)
    If IsStart Then
        Application.ScreenUpdating = False
        Application.DisplayAlerts = False
        Application.Calculation = xlCalculationManual
        Application.StatusBar = ">>>>>>>>Macro Is Running>>>>>>>>"
    Else
        Application.ScreenUpdating = True
        Application.DisplayAlerts = True
        Application.Calculation = xlCalculationAutomatic
        Application.StatusBar = False
    End If
End Sub

Public Function RegGetArray(ByVal OrgText As String, ByVal Pattern As String) As String()
    Dim Reg As Object, Mh As Object, OneMh As Object
    Dim Arr() As String, Index As Long
    Dim Elm As String
    Set Reg = CreateObject("Vbscript.Regexp")
    With Reg
        .MultiLine = True
        .Global = True
        .Ignorecase = False
        .Pattern = Pattern
        Set Mh = .Execute(OrgText)
        
        Index = 0
        ReDim Arr(1 To 1)
        For Each OneMh In Mh
            Index = Index + 1
            ReDim Preserve Arr(1 To Index)
            'If OneMh.submatches(0) <> "" Then Elm = OneMh.submatches(0)
            Arr(Index) = OneMh.submatches(0)
            'Debug.Print OneMh.submatches(0)
        Next OneMh
    End With
    RegGetArray = Arr
    Set Reg = Nothing
    Set Mh = Nothing
End Function


Public Function RegGet(ByVal OrgText As String, ByVal Pattern As String) As String
'传递参数 :原字符串, 匹配模式
    Dim Regex As Object
    Dim Mh As Object
    Set Regex = CreateObject("VBScript.RegExp")
    With Regex
        .Global = True
        .Pattern = Pattern
    End With
    If Regex.test(OrgText) Then
        Set Mh = Regex.Execute(OrgText)
        RegGet = Mh.Item(0).submatches(0)
    Else
        RegGet = ""
    End If
    Set Regex = Nothing
End Function

  

posted @ 2018-02-09 22:44  wangway  阅读(237)  评论(0编辑  收藏  举报