20180301越努力越轻松

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
'目前存在的BUG
'图片补丁存在多个URL
'题目中间存在小数的问题在正则表达式里加上\d+\D
Private Declare Function URLDownloadToFile Lib "urlmon" Alias "URLDownloadToFileA" (ByVal pCaller As Long, ByVal szURL As String, ByVal szFileName As String, ByVal dwReserved As Long, ByVal lpfnCB As Long) As Long
Private Declare Function DeleteUrlCacheEntry Lib "wininet" Alias "DeleteUrlCacheEntryA" (ByVal lpszUrlName As String) As Long
Sub DownloadImageName(ByVal ImageURL As String, ByVal ImagePath As String)
    Dim lngRetVal As Long
    lngRetVal = URLDownloadToFile(0, ImageURL, ImagePath, 0, 0)
    If lngRetVal = 0 Then
        DeleteUrlCacheEntry ImageURL  '清除缓存
        'MsgBox "成功"
    Else
        'MsgBox "失败"
    End If
End Sub
Sub LoopGetSubject()
    Dim StartTime As Variant
    Dim UsedTime As Variant
    StartTime = VBA.Timer
     
    Dim msg As Variant
    msg = MsgBox("Choose 'Yes' to Continue,Choose 'No' to Exit !", vbYesNo, "AuthorQQ 84857038")
    If msg = vbNo Then Exit Sub
     
    Dim Sht As Worksheet
    Set Sht = ThisWorkbook.ActiveSheet
    With Sht
        EndRow = .Cells(.Cells.Rows.Count, 1).End(xlUp).Row
        For i = 2 To EndRow
            SetFontRed .Cells(i, 1).Resize(1, 3)
            FindText = Mid(.Cells(i, 3).Text, 4, Len(.Cells(i, 3).Text) - 8)
            ExamUrl = .Cells(i, 2).Text
            Source = .Cells(i, 1).Text
            Call GetExamTextByUrl(ExamUrl, FindText, Source)
        Next i
    End With
    Set Sht = Nothing
    UsedTime = VBA.Timer - StartTime
    Debug.Print "UsedTime :" & Format(UsedTime, "#0.0000 Seconds")
    MsgBox "UsedTime :" & Format(UsedTime, "#0.0000 Seconds")
End Sub
 
Sub ConditionGetSubject()
    Dim StartTime As Variant
    Dim UsedTime As Variant
    StartTime = VBA.Timer
     
    Dim msg As Variant
     
     
    Text = Application.InputBox("请输入筛选关键词,支持LIKE方法的通配符与|分支: ", "AuthorQQ 84857038", , , , , , 2)
     
    If Text = False Then
        msg = MsgBox("本次执行等同于提取所有题目,是否继续?,Choose 'No' to Exit !", vbYesNo, "AuthorQQ 84857038")
        If msg = vbNo Then Exit Sub
    End If
     
    'Condition = "*" & Text & "*"
     
    Dim Sht As Worksheet
    Set Sht = ThisWorkbook.ActiveSheet
    With Sht
        EndRow = .Cells(.Cells.Rows.Count, 1).End(xlUp).Row
        For i = 2 To EndRow
            If InStr(Text, "|") = 0 Then
                Condition = "*" & Text & "*"
                If .Cells(i, 3).Text Like Condition Then
                    SetFontRed .Cells(i, 1).Resize(1, 3)
                    FindText = Mid(.Cells(i, 3).Text, 4, Len(.Cells(i, 3).Text) - 8)
                    ExamUrl = .Cells(i, 2).Text
                    Source = .Cells(i, 1).Text
                    Call GetExamTextByUrl(ExamUrl, FindText, Source)
                End If
            Else
                conditions = Split(Text, "|")
                For n = LBound(conditions) To UBound(conditions) Step 1
                    Condition = "*" & conditions(n) & "*"
                    If .Cells(i, 3).Text Like Condition Then
                        SetFontRed .Cells(i, 1).Resize(1, 3)
                        FindText = Mid(.Cells(i, 3).Text, 4, Len(.Cells(i, 3).Text) - 8)
                        ExamUrl = .Cells(i, 2).Text
                        Source = .Cells(i, 1).Text
                        Call GetExamTextByUrl(ExamUrl, FindText, Source)
                    End If
                Next n
            End If
        Next i
         
         
    End With
    Set Sht = Nothing
    UsedTime = VBA.Timer - StartTime
    Debug.Print "UsedTime :" & Format(UsedTime, "#0.0000 Seconds")
    MsgBox "UsedTime :" & Format(UsedTime, "#0.0000 Seconds")
End Sub
 
 
 
Sub GetSubject()
    Dim Rng As Range
    Dim OneCell As Range
    Set Rng = Application.Selection
    For Each OneCell In Rng.Cells
        If OneCell.Column = 3 Then
            If Len(OneCell.Text) > 0 Then
                SetFontRed OneCell
                FindText = Mid(OneCell.Text, 4, Len(OneCell.Text) - 8)
                ExamUrl = OneCell.Offset(0, -1).Text
                Source = OneCell.Offset(0, -2).Text
                Call GetExamTextByUrl(ExamUrl, FindText, Source)
            End If
        End If
    Next OneCell
End Sub
Sub GetExamTextByUrl(ByVal ExamUrl As String, ByVal FindText As String, ByVal Source As String)
    Dim Subject As String
    Dim HasImageText As String
    Dim Question As String
    Dim ImageURL As String
    Dim Answer As String
    Dim HasGetContent As Boolean
    Dim docName As String
    Dim docPath As String
    Dim Independent As Boolean
    Dim IsQuestion As Boolean
    Dim IsAnswer As Boolean
    Dim oneP As Object
    Dim nextTag As Object
     
    'send request
    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", ExamUrl, False
        .Send
        WebText = .responsetext
        'Debug.Print WebText
        ' Stop
    End With
    With CreateObject("htmlfile")
        .write WebText
        Set examdiv = .getElementById("sina_keyword_ad_area2")
        '获取试卷文本内容
        ExamText = examdiv.innerText
         
        '判断试卷是否含有独立答案
        Independent = ExamText Like "*参考答案*"
        'Debug.Print "  Independent "; Independent
        '设定搜集题目Word文档名称和路径
        docName = Application.ActiveSheet.Name & "_题目搜集.doc"
        docPath = ThisWorkbook.Path & "\" & docName
        '判断某个段落是否为题目/答案的开始
        IsQuestion = False
        IsAnswer = False
        '判断是否已经提取到内容
        HasGetContent = False
        '循环所有段落
        For Each oneP In .getElementsByTagName("p")
            If HasGetContent = False Then
                '判断某段内容是否为题号行
                'If oneP.innerText Like "##.*" Or oneP.innerText Like "##.*" Then
                If RegTest(oneP.innerText, "(\d{1,2})[\..]\D.*") Then
                    '保留题干源码 并去除掉标题title的内容,提取出汉字作为分隔符
                    pInnerHtml = oneP.innerHTML
                    pInnerHtml = Replace(pInnerHtml, Source, "")
                    HeadSp = RegGet(pInnerHtml, "([\u4e00-\u9fa5]{5,})")
                     
                    Subject = ""
                    Question = ""
                    ImageURL = ""
                    Answer = ""
                    '开始记录题干内容
                    Subject = oneP.innerText
                    'Debug.Print OneP.innerText
                Else
                    If InStr(oneP.innerText, FindText) = 0 Then
                        '过滤不相干的问题,仅保留符合条件的问题
                        If Not RegTest(oneP.innerText, "([\((]\d[\))]).*") Then
                            '继续记录问题内容
                            Subject = Subject & vbCrLf & oneP.innerText
                        End If
                    End If
                End If
                 
                '提取题目的序号和问题的序号
                If InStr(oneP.innerText, FindText) > 0 Then
                    '保留问题的源码,删除掉标题Title的内容 并提取出汉字 作为分隔符
                    questionHtml = oneP.innerHTML
                    questionHtml = Replace(questionHtml, Source, "")
                    TailSp = RegGet(questionHtml, "([\u4e00-\u9fa5]{5,})")
                      
                    'Debug.Print ">>>>>汉字分隔符>>>>"; HeadSp
                    'Debug.Print ">>>>>查找>>>>" & FindText; InStr(WebText, TailSp) > 0
                    HasImageText = Split(WebText, TailSp)(0)
                    pos = InStrRev(HasImageText, HeadSp)
                    HasImageText = Mid(HasImageText, pos)
                     
                    SubjectIndex = RegGet(Subject, "(\d{1,2})[..]\D.*")
                    Question = oneP.innerText
                    questionIndex = RegGet(Question, "[\((](\d)[\))].*")
                    'Debug.Print "题序:"; SubjectIndex; "   问序: "; questionIndex
                    HasGetContent = True
                End If
                 
            Else
                '提取内容后 开始找答案
                '试卷不含独立答案,答案就附在每道题后面
                If Independent = False Then
                     
                    If IsAnswer = False Then
                        If RegTest(oneP.innerText, "[\((](" & questionIndex & ")[\))].*") Then
                            Answer = oneP.innerText
                            IsAnswer = True
                            'Exit For
                        End If
                    Else
                        Debug.Print oneP.innerText
                        If RegTest(oneP.innerText, "[\((](\d)[\))].*") Or RegTest(oneP.innerText, "(\d{1,2})[..]\D.*") Then
                            Exit For
                        Else
                            Answer = Answer & oneP.innerText
                        End If
                    End If
                Else
                    '试卷还有独立参考答案
                    '判断某段内容的题号是否符合条件
                    If RegTest(oneP.innerText, "(" & SubjectIndex & ")[\..].\D*") Then
                        IsQuestion = True
                        'Debug.Print isQuestion
                    End If
                    If IsQuestion = True Then
                        '判断某段内容的问题序号是否符合条件
                        If IsAnswer = False Then
                            If RegTest(oneP.innerText, "([\((]" & questionIndex & "[\))]).*") Then
                                '记录问题答案
                                Answer = oneP.innerText
                                IsAnswer = True
                                'Exit For
                            End If
                        Else
                            Debug.Print oneP.innerText
                            If RegTest(oneP.innerText, "[\((](\d)[\))].*") Or RegTest(oneP.innerText, "(\d{1,2})[..]\D.*") Then
                                Exit For
                            Else
                                Answer = Answer & oneP.innerText
                            End If
                        End If
                    End If
                End If
            End If
        Next oneP
        '图片地址处理
        ' ImageURL = Mid(ImageURL, 2)
        '测试
         
        'Debug.Print ImageURL
        Debug.Print Question
        Debug.Print Answer
    End With
    '<span style="font-family:">43.</span>
    '【补丁,有待改进】2017年下半年部分图片提取不到的问题修正
    ImageURL = ""
    If Len(ImageURL) = 0 Then
     
        imgs = RegGetArray(HasImageText, "real_src =""(http.*?)""")
         
        For n = LBound(imgs) To UBound(imgs) Step 1
            'Debug.Print imgs(n)
            ImageURL = ImageURL & "|" & imgs(n)
        Next n
         
        'Stop
        ImageURL = Mid(ImageURL, 2)
        Debug.Print "所有图片地址:"; ImageURL
        'Stop
        'hasimagetext = Split(hasimagetext, "real_src")(UBound(Split(hasimagetext, "real_src")))
        'ImageURL = Split(hasimagetext, """")(1)
    End If
     
    '输出题目内容到Word文档
    Dim wdApp As Object
    Dim Doc As Object
     
    On Error Resume Next
    Set wdApp = GetObject(, "Word.Application")
    On Error GoTo 0
    If Not wdApp Is Nothing Then
        wdApp.Visible = True
        On Error Resume Next
        Set Doc = wdApp.documents(docName)
        On Error GoTo 0
        If Doc Is Nothing Then
            Set Doc = wdApp.documents.Add()
            Doc.SaveAs docPath
        End If
    Else
        Set wdApp = CreateObject("Word.Application")
        wdApp.Visible = True
         
        If Dir(docPath) <> "" Then
            Set Doc = wdApp.documents.Open(docPath)
        Else
            Set Doc = wdApp.documents.Add()
            Doc.SaveAs docPath
        End If
    End If
     
    Doc.Activate
    wdApp.Selection.EndKey 6
    wdApp.Selection.TypeParagraph
    wdApp.Selection.InsertBreak 7
    '输出题干内容
    'Debug.Print Subject
    Subject = RegReplace(Subject, "(" & SubjectIndex & "[\..])") & "."
    'Debug.Print Subject
    'Stop
    wdApp.Selection.TypeText Text:=Subject
    wdApp.Selection.TypeParagraph
     
    '下载图片并插入WORD文档
    If ImageURL <> "" Then
        If InStr(ImageURL, "|") = 0 Then
            ImagePath = ThisWorkbook.Path & Application.PathSeparator & "tmp.jpg"
            DownloadImageName ImageURL, ImagePath
            wdApp.Selection.InlineShapes.AddPicture Filename:=ImagePath, LinkToFile:=False, SaveWithDocument:=True
            wdApp.Selection.TypeParagraph
            Kill ImagePath
            'Stop'
        Else
            ImageURLs = Split(ImageURL, "|")
            For n = LBound(ImageURLs) To UBound(ImageURLs) Step 1
                ImagePath = ThisWorkbook.Path & Application.PathSeparator & n & "tmp.jpg"
                DownloadImageName ImageURLs(n), ImagePath
                Debug.Print ImagePath
                wdApp.Selection.InlineShapes.AddPicture Filename:=ImagePath, LinkToFile:=False, SaveWithDocument:=True
                wdApp.Selection.TypeParagraph
                Kill ImagePath
            Next n
        End If
    End If
    '输出问题内容
    wdApp.Selection.TypeText Text:=RegReplace(Question, "([\((]\d[\))])")
    wdApp.Selection.TypeParagraph
    '输出答案内容
     
    sp = RegGet(Answer, "([\((]" & questionIndex & "[\))]).*")
    'Debug.Print Sp
    If Len(sp) > 0 Then
        Answer = Split(Answer, sp)(1)
        sp = RegGet(Answer, "([\((]" & questionIndex + 1 & "[\))]).*")
        If Len(sp) > 0 Then
            Answer = Split(Answer, sp)(0)
        End If
    End If
    'Debug.Print Answer
    Answer = RegReplace(Answer, "(【来源】.*)")
    Answer = RegReplace(Answer, "(【解析】.*)")
    'Debug.Print Answer
    'Stop
    wdApp.Selection.TypeText Text:="【答案】" & Answer
     
     
    Source = Replace(Source, "【", "")
    Source = Replace(Source, "】", "")
    Source = Replace(Source, "解析", "")
     
    wdApp.Selection.TypeParagraph
    wdApp.Selection.TypeText Text:="[ 来源:" & Source & " 第" & SubjectIndex & "题 第(" & questionIndex & ")问 ]"
    wdApp.Selection.TypeParagraph
     
    Set wdApp = Nothing
    Set Doc = Nothing
    Set oneP = Nothing
End Sub
Private Function RegTest(ByVal OrgText As String, ByVal Pattern As String) As Boolean
'传递参数 :原字符串, 匹配模式
    Dim Regex As Object
    Set Regex = CreateObject("VBScript.RegExp")
    With Regex
        .Global = True
        .Pattern = Pattern
    End With
    RegTest = Regex.test(OrgText)
    Set Regex = Nothing
End Function
Public Function RegGet(ByVal OrgText As String, ByVal Pattern As String) As String
'传递参数 :原字符串, 匹配模式
    Dim Regex As Object
    Dim Mh As Object
    Set Regex = CreateObject("VBScript.RegExp")
    With Regex
        .Global = True
        .Pattern = Pattern
    End With
    If Regex.test(OrgText) Then
        Set Mh = Regex.Execute(OrgText)
        RegGet = Mh.Item(0).submatches(0)
    Else
        RegGet = ""
    End If
    Set Regex = Nothing
End Function
Sub SetFontRed(ByVal Rng As Range)
    With Rng.Font
        .Color = -16776961
        .TintAndShade = 0
    End With
End Sub
Public Function RegReplace(ByVal OrgText As String, ByVal Pattern As String, Optional RepStr As String = "") As String
'传递参数 :原字符串, 匹配模式 ,替换字符
    Dim Regex As Object
    Dim newText As String
    Set Regex = CreateObject("VBScript.RegExp")
    With Regex
        .Global = True
        .Pattern = Pattern
    End With
    newText = Regex.Replace(OrgText, RepStr)
    RegReplace = newText
    Set Regex = Nothing
End Function
Public Function RegGetArray(ByVal OrgText As String, ByVal Pattern As String) As String()
    Dim Reg As Object, Mh As Object, OneMh As Object
    Dim Arr() As String, Index As Long
    Dim Elm As String
    Set Reg = CreateObject("Vbscript.Regexp")
    With Reg
        .MultiLine = True
        .Global = True
        .Ignorecase = False
        .Pattern = Pattern
        Set Mh = .Execute(OrgText)
         
        Index = 0
        ReDim Arr(1 To 1)
        For Each OneMh In Mh
            Index = Index + 1
            ReDim Preserve Arr(1 To Index)
            'If OneMh.submatches(0) <> "" Then Elm = OneMh.submatches(0)
            Arr(Index) = OneMh.submatches(0)
            'Debug.Print OneMh.submatches(0)
        Next OneMh
    End With
    RegGetArray = Arr
    Set Reg = Nothing
    Set Mh = Nothing
End Function
 
Function RealInnerHtml(ByVal OrgInnerHtml) As String
      Dim x As String
      x = OrgInnerHtml
      x = Replace(x, "SPAN", "span")
      x = Replace(x, "FONT-SIZE", "font-size")
      x = Replace(x, "FONT-FAMILY", "font-family")
      x = Replace(x, "FONT", "font")
      x = Replace(x, "WBR", "wbr")
      x = Replace(x, "COLOR", "color")
      RealInnerHtml = x
End Function
Public Function RegGetLast(ByVal OrgText As String, ByVal Pattern As String) As String
'传递参数 :原字符串, 匹配模式
    Dim Regex As Object
    Dim Mh As Object
    Set Regex = CreateObject("VBScript.RegExp")
    With Regex
        .Global = True
        .Pattern = Pattern
    End With
    If Regex.test(OrgText) Then
        Set Mh = Regex.Execute(OrgText)
        'RegGetLast = Mh.Item(0).submatches(0)
        For Each OneMh In Mh
            RegGetLast = OneMh.submatches(0)
        Next OneMh
    Else
        RegGetLast = ""
    End If
    Set Regex = Nothing
End Function

  

posted @   wangway  阅读(279)  评论(0编辑  收藏  举报
编辑推荐:
· 智能桌面机器人:用.NET IoT库控制舵机并多方法播放表情
· Linux glibc自带哈希表的用例及性能测试
· 深入理解 Mybatis 分库分表执行原理
· 如何打造一个高并发系统?
· .NET Core GC压缩(compact_phase)底层原理浅谈
阅读排行:
· 手把手教你在本地部署DeepSeek R1,搭建web-ui ,建议收藏!
· 新年开篇:在本地部署DeepSeek大模型实现联网增强的AI应用
· Janus Pro:DeepSeek 开源革新,多模态 AI 的未来
· 互联网不景气了那就玩玩嵌入式吧,用纯.NET开发并制作一个智能桌面机器人(三):用.NET IoT库
· 【非技术】说说2024年我都干了些啥
点击右上角即可分享
微信分享提示