试卷

Private Declare Function URLDownloadToFile Lib "urlmon" Alias "URLDownloadToFileA" (ByVal pCaller As Long, ByVal szURL As String, ByVal szFileName As String, ByVal dwReserved As Long, ByVal lpfnCB As Long) As Long
Private Declare Function DeleteUrlCacheEntry Lib "wininet" Alias "DeleteUrlCacheEntryA" (ByVal lpszUrlName As String) As Long
Public Sub Crawler()
    Dim StartTime As Variant '开始时间
    Dim UsedTime As Variant '使用时间
    StartTime = VBA.Timer '记录开始时间
    
    With Sheets("试卷URL")
        i = 2
        Do While .Cells(i, 1).Value <> ""
            OneKeyCreateExam .Cells(i, 2).Text
            i = i + 1
            If i = 1000 Then Exit Do
        Loop
    End With
    
    UsedTime = VBA.Timer - StartTime
    MsgBox "本次运行耗时:" & Format(UsedTime, "#0.0000秒")
    
End Sub
'下载网络图片
Sub DownloadImageName(ByVal ImageURL As String, ByVal ImagePath As String)
    Dim lngRetVal As Long
    lngRetVal = URLDownloadToFile(0, ImageURL, ImagePath, 0, 0)
    If lngRetVal = 0 Then
        DeleteUrlCacheEntry ImageURL  '清除缓存
        'MsgBox "成功"
    Else
        'MsgBox "失败"
    End If
End Sub

Sub OneKeyCreateExam(ByVal URL As String)
    Dim ImgNames As Variant
    Dim strText As String
    Dim i As Long, n As Long, m As Long
    Dim OneTagP As Object
    Dim OneTagA As Object
    Dim TagP As Object
    Dim PosText As String
    Dim Arr() As String
    ReDim Arr(1 To 1) As String
    Dim Brr() As String
    ReDim Brr(1 To 1)
    Dim ImageURL As String
    Dim FilePath As String
    Dim FileName As String
    
    Dim dContent As Object
    Set dContent = CreateObject("Scripting.Dictionary")
    Dim dImageName As Object
    Set dImageName = CreateObject("Scripting.Dictionary")
    
    Dim StartTime As Variant    '开始时间
    Dim UsedTime As Variant    '使用时间
    StartTime = VBA.Timer    '记录开始时间
    
    AppSettings
   On Error GoTo ErrHandler
    'Debug.Print URL
    '设置URL,访问网页获取网页源码
    'URL = ActiveSheet.Range("A2").Text    '"http://blog.sina.com.cn/s/blog_5a18c50f0102x8lg.html"
    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", URL, False
        .Send
        strText = .responsetext
    End With
    
    '创建网页文件
    With CreateObject("htmlfile")
        .write strText
        
        '获取标题
        FileName = .getElementsByTagName("h2")(0).innerhtml
        Debug.Print "开始正在采集>>>>>>"; FileName; "   网络地址 :" & URL
        
        Application.StatusBar = ">>>>>>正在下载图片>>>>>>"
        
        i = 0    '初始化序号
        
        For Each OneTagA In .getElementsByTagName("a")
            '循环所有A标签
            If OneTagA.HasChildNodes Then
                If OneTagA.href Like "http://photo.blog.sina.com.cn/showpic.html*" Then
                    
                    
                    If OneTagA.ParentNode.tagname = "DIV" Then
                        Set TagP = OneTagA.PreviousSibling
                        Do While TagP.tagname <> "P"
                            Set TagP = TagP.PreviousSibling
                        Loop
                        i = i + 1
                        '文字内容提取
                        PosText = TagP.innerhtml
                        PosText = RegReplace(PosText, "<.*?>")
                        PosText = Replace(PosText, " ", "")
                        
                        '获取图片URL
                        ImageURL = OneTagA.FirstChild.getAttribute("real_src")
                        ImageName = "Image" & i & ".jpg"
                        ImagePath = ThisWorkbook.Path & Application.PathSeparator & ImageName
                        DownloadImageName ImageURL, ImagePath    '下载图片
                        
                        '获取图片
                        If dImageName.Exists(PosText) = False Then
                            dImageName(PosText) = ImageName
                        Else
                            dImageName(PosText) = dImageName(PosText) & "|" & ImageName
                        End If
                    ElseIf OneTagA.ParentNode.tagname = "P" Then
                        'On Error Resume Next
                        Set TagP = OneTagA.ParentNode.PreviousSibling
                        
                        i = i + 1
                        '文字内容提取
                        PosText = TagP.innerhtml
                        PosText = RegReplace(PosText, "<.*?>")
                        PosText = Replace(PosText, " ", "")
                        
                        '获取图片URL
                        ImageURL = OneTagA.FirstChild.getAttribute("real_src")
                        ImageName = "Image" & i & ".jpg"
                        ImagePath = ThisWorkbook.Path & Application.PathSeparator & ImageName
                        DownloadImageName ImageURL, ImagePath    '下载图片
                        
                        '获取图片
                        If dImageName.Exists(PosText) = False Then
                            dImageName(PosText) = ImageName
                        Else
                            dImageName(PosText) = dImageName(PosText) & "|" & ImageName
                        End If
                        
                        
                    End If
                End If
            End If
        Next OneTagA
        
        Application.StatusBar = ">>>>>>正在获取文本>>>>>>"
        
        Dim IsContent As Boolean
        i = 0    '初始化序号
        n = 0    '初始化序号
        For Each OneTagP In .getElementsByTagName("p")
            '文字内容提取
            
            PosText = OneTagP.innerhtml
            PosText = RegReplace(PosText, "<.*?>")
            PosText = Replace(PosText, " ", "")
            
            i = i + 1
            If (InStr(PosText, "地理") > 0 And i > 15) Or i > 20 Then IsContent = True
            If PosText = "喜欢" Then Exit For    '提前结束循环
            
            If IsContent Then
                If Len(PosText) > 0 Then
                    n = n + 1
                    ReDim Preserve Arr(1 To n) '保留非空数组  '开始记录试卷内容
                    Arr(n) = PosText    '存入数组
                    '   Debug.Print n; "               "; PosText
                    'dContent(PosText) = n
                End If
            End If
        Next
        
        
    End With
    
    
    
    Application.StatusBar = ">>>>>>正在创建Word文档>>>>>>"
    
    FilePath = ThisWorkbook.Path & "" & FileName & ".doc"
    On Error Resume Next
    Kill FilePath
    On Error GoTo 0
    
    Dim wdApp As Object
    Dim Doc As Object
    Set wdApp = CreateObject("Word.Application")
    Set Doc = wdApp.documents.Add()
    
    Doc.Activate
    
    For i = 1 To UBound(Arr)
        
        PosText = Arr(i)
        
        wdApp.Selection.TypeText Text:=PosText
        wdApp.Selection.TypeParagraph
        
        
        If dImageName.Exists(PosText) Then
            '如果含有图片
            If InStr(dImageName(PosText), "|") = 0 Then
                '如果只含有一张图片
                ImageName = dImageName(PosText)
                ImagePath = ThisWorkbook.Path & Application.PathSeparator & ImageName
                wdApp.Selection.InlineShapes.AddPicture FileName:=ImagePath, LinkToFile:=False, SaveWithDocument:=True
                wdApp.Selection.TypeParagraph
            Else
                ImgNames = Split(dImageName(PosText), "|")
                For n = LBound(ImgNames) To UBound(ImgNames) Step 1
                    ImageName = ImgNames(n)
                    ImagePath = ThisWorkbook.Path & Application.PathSeparator & ImageName
                    wdApp.Selection.InlineShapes.AddPicture FileName:=ImagePath, LinkToFile:=False, SaveWithDocument:=True
                    wdApp.Selection.TypeParagraph
                Next n
            End If
        End If
    Next i
    
    Doc.SaveAs FilePath
    Doc.Close
    
    
    
    Application.StatusBar = ">>>>>>正在删除Image图片>>>>>>"
    
    For Each Key In dImageName.keys
        If InStr(dImageName(Key), "|") = 0 Then
            ImageName = dImageName(Key)
            ImagePath = ThisWorkbook.Path & Application.PathSeparator & ImageName
            Kill ImagePath
        Else
            ImgNames = Split(dImageName(Key), "|")
            For n = LBound(ImgNames) To UBound(ImgNames) Step 1
                ImageName = ImgNames(n)
                ImagePath = ThisWorkbook.Path & Application.PathSeparator & ImageName
                Kill ImagePath
            Next n
        End If
    Next Key
    
    
    UsedTime = VBA.Timer - StartTime
    Debug.Print FileName; "    "; UsedTime
    'MsgBox "本次运行耗时:" & Format(UsedTime, "#0.0000秒")
    
ErrorExit:
    If Not wdApp Is Nothing Then wdApp.Quit
    Set wdApp = Nothing
    Set Doc = Nothing
    
    AppSettings False
    Exit Sub
ErrHandler:
    If Err.Number <> 0 Then
        'MsgBox Err.Description & "!", vbCritical, "QQ 84857038"
        Debug.Print Err.Description
        Err.Clear
        Resume ErrorExit
    End If
End Sub
Public Function RegReplace(ByVal OrgText As String, ByVal Pattern As String, Optional RepStr As String = "") As String
'传递参数 :原字符串, 匹配模式 ,替换字符
    Dim Regex As Object
    Dim newText As String
    Set Regex = CreateObject("VBScript.RegExp")
    With Regex
        .Global = True
        .Pattern = Pattern
    End With
    newText = Regex.Replace(OrgText, RepStr)
    RegReplace = newText
    Set Regex = Nothing
End Function
Public Sub AppSettings(Optional IsStart As Boolean = True)
    If IsStart Then
        Application.ScreenUpdating = False
        Application.DisplayAlerts = False
        Application.Calculation = xlCalculationManual
        Application.StatusBar = ">>>>>>>>Macro Is Running>>>>>>>>"
    Else
        Application.ScreenUpdating = True
        Application.DisplayAlerts = True
        Application.Calculation = xlCalculationAutomatic
        Application.StatusBar = False
    End If
End Sub

  

原文地址:https://www.cnblogs.com/nextseven/p/8469172.html