20170601xlVBA正则表达式提取体检数据
Public Sub GetFirst() GetDataFromWord "初检" End Sub Public Sub GetDataFromWord(ByVal SheetName As String) AppSettings 'On Error GoTo ErrHandler Dim StartTime, UsedTime As Variant StartTime = VBA.Timer 'Input code here Dim Wb As Workbook Dim Sht As Worksheet Dim oSht As Worksheet Dim Rng As Range Dim Arr As Variant Dim wdApp As Word.Application Dim wdDoc As Word.Document Dim wdRng As Word.Range 'Const SHEET_NAME As String = "提取信息" Set Wb = Application.ThisWorkbook Set Sht = Wb.Worksheets(SheetName) Dim FilePath As String With Application.FileDialog(msoFileDialogFilePicker) .AllowMultiSelect = False .InitialFileName = Wb.Path .Title = "提取" & SheetName & "数据" .Filters.Clear .Filters.Add "Word文档", "*.rtf*" If .Show = -1 Then FilePath = .SelectedItems(1) Else MsgBox "您没有选中任何文件夹,本次汇总中断!" Exit Sub End If End With Debug.Print FilePath Set wdApp = New Word.Application Set wdDoc = wdApp.Documents.Open(FilePath) Application.StatusBar = ">>>>>>>>Positioning & Replacing >>>>>>>>" PositioningClear wdDoc, 5 '定位删除英文行 避免正则提取造成干扰 Application.StatusBar = ">>>>>>>>Regexpress Getting array >>>>>>>>" Arr = RegGetArray(wdDoc.Content.Text) '正则从全文提取内容 存入数组 wdDoc.Close False '关闭doc wdApp.Quit '退出app Set wdApp = Nothing Set wdDoc = Nothing With Sht .Cells.Clear .Range("A1:D1").Value = Array("大项", "小项", "D值", "E值") Set Rng = .Range("A2").Resize(UBound(Arr, 2), UBound(Arr)) Rng.Value = Application.WorksheetFunction.Transpose(Arr) Sort2003 .UsedRange End With UsedTime = VBA.Timer - StartTime Debug.Print "UsedTime:" & Format(UsedTime, "0.000 Seconds") 'MsgBox "UsedTime:" & Format(UsedTime, "0.000 Seconds"), vbOKOnly, "NextSeven QQ " ErrorExit: Set Wb = Nothing Set Sht = Nothing Set Rng = Nothing AppSettings False On Error Resume Next wdApp.Quit Exit Sub ErrHandler: If Err.Number <> 0 Then MsgBox Err.Description & "!", vbCritical, "NextSeven QQ " Debug.Print Err.Description Err.Clear Resume ErrorExit End If End Sub Public Sub AppSettings(Optional IsStart As Boolean = True) If IsStart Then Application.ScreenUpdating = False Application.DisplayAlerts = False Application.Calculation = xlCalculationManual Application.StatusBar = ">>>>>>>>Macro Is Running>>>>>>>>" Else Application.ScreenUpdating = True Application.DisplayAlerts = True Application.Calculation = xlCalculationAutomatic Application.StatusBar = False End If End Sub Function RegGetArray(ByVal OrgText As String) As String() Dim Reg As Object, Mh As Object, OneMh As Object Dim Reg2 As Object Dim Arr() As String, Index As Long Dim Elm As String Set Reg = CreateObject("Vbscript.Regexp") Set Reg2 = CreateObject("Vbscript.Regexp") Reg2.Global = True With Reg 'OrgText = Application.ActiveDocument.Content .MultiLine = True .Global = True .Ignorecase = False '可用 '.Pattern = "(?:\s)?(\S*?)?\s? *" & "(?:[ ])([^ ][^\r\n\v]*?)\s+?(D=[\d\.]+)\s+(E=[\d\.]+)[\s]+?" .Pattern = "(?:\s+?)([一-龥;,,]*?)?\s? *" & "(?:[ ])([^ ][^\r\n\v]*?)\s+?(D=[\d\.]+)\s+(E=[\d\.]+)[\s]+?" Set Mh = .Execute(OrgText) Index = 0 ReDim Arr(1 To 4, 1 To 1) For Each OneMh In Mh Index = Index + 1 ReDim Preserve Arr(1 To 4, 1 To Index) If OneMh.submatches(0) <> "" Then Elm = OneMh.submatches(0) Reg2.Pattern = "[;,,]?(左视图|前视图|纵切面)+[;,,]?" Arr(1, Index) = Reg2.Replace(Elm, "") Reg2.Pattern = "[\s#G]" Arr(2, Index) = Reg2.Replace(OneMh.submatches(1), "") 'Debug.Print OneMh.submatches(2) Arr(3, Index) = Split(OneMh.submatches(2), "=")(1) 'Debug.Print OneMh.submatches(3) Arr(4, Index) = Split(OneMh.submatches(3), "=")(1) Next OneMh End With RegGetArray = Arr Set Reg = Nothing: Set Mh = Nothing Set Reg2 = Nothing End Function Public Sub PositioningClear(ByVal OpenDoc As Word.Document, ByVal Times As Long) Dim wdRng As Word.Range Dim lngStart As Long Dim lngEnd As Long Dim lngTime As Long For lngTime = 1 To Times lngEnd = OpenDoc.Content.End With OpenDoc.Content.Find .ClearFormatting .Replacement.ClearFormatting .Text = "ALIMENTARY SYSTEM" .Replacement.Text = "" If .Execute Then lngStart = .Parent.Start Set wdRng = OpenDoc.Range(lngStart, lngEnd) End If End With If Not wdRng Is Nothing Then With wdRng.Find .ClearFormatting .Replacement.ClearFormatting .Text = "[^l^13][A-Za-z0-9\- ,;:.]@[^l^13]" .MatchWildcards = True .Wrap = wdFindStop .Forward = True .Replacement.Text = "^l" 'n = 0 .Execute Replace:=wdReplaceAll 'Do While .Execute ' n = n + 1 ' Debug.Print n; "____________"; .Parent.Text ' If n > 1000 Then Exit Do 'Loop End With End If Set wdRng = Nothing Next lngTime End Sub Sub Sort2003(ByVal RngWithTitle As Range, Optional SortColumnNo As Long = 1) 'key1代表第一个排序的列的关键字 'Order1表示第一字段的排序方式,赋值为xlAscending表示升序,改为xlDescending表示降序。 'Header表示是否包含标题,赋值为xlYes表示标题不参与排序,赋值为xlNo表示标题也参数排序 'MatchCase表示排序时是否区分大小写,赋值为False表示不区分大小写 'Orientation表示排序方向,赋值为xlTopToBottom或者xlSortColumns表示按列排序,赋值为xlSortRows 表示排行排序 'SortMethod用于限制对汉字排序时的排序方式,赋值为xlPinYin表示按拼音排序,赋值为xlStroke表示按笔划排序 With RngWithTitle .Sort Key1:=RngWithTitle.Cells(1, SortColumnNo), Order1:=xlAscending, Header:=xlYes, _ MatchCase:=False, Orientation:=xlTopToBottom, SortMethod:=xlPinYin End With End Sub