vba 网页数据抓取(HTML代码应将所选页面的所有数据复制到sheet1链接 )
优采云 发布时间: 2022-04-01 19:07vba 网页数据抓取(HTML代码应将所选页面的所有数据复制到sheet1链接
)
我不确定错误来自哪里,我明白了。
以下代码应该会有所帮助,它将指定页面的表格内容打印到调试窗口。
以下代码应将所选页面的所有数据复制到 sheet1
您需要在 VBA 编辑器中添加一些引用才能使用它。(工具菜单,参考,然后找到并选择它们)Microsoft HTML 对象库和 Microsoft Internet 控件
Const MaxPage = 2 ' set to 26 (or however many there are) - at 2 for testing purposes
Dim Browser As InternetExplorer
Sub Start()
Dim Page As Integer: Page = 1 ' start at page 1
Dim PageDocument As IHTMLDocument
Dim RecordRow As IHTMLElementCollection
Dim RecordItem As IHTMLElement
Dim Sheet As Worksheet: Set Sheet = ThisWorkbook.Worksheets("Sheet1") ' output sheet
If Browser Is Nothing Then
Set Browser = New InternetExplorer
End If
Dim oRow As Integer: oRow = 2 ' begin output at row 2 (account for header)
Dim Record As Integer
For Page = 1 To MaxPage
LoadPage Page
For Record = 0 To 99 ' zero index, 100 items (1-99)
Set PageDocument = Browser.Document
Set RecordRow = PageDocument.getElementById("toplists").getElementsByTagName("table")(0).getElementsByTagName("tbody")(0).getElementsByTagName("tr")(Record).getElementsByTagName("td")
Sheet.Cells(oRow, 1).Value = Trim(RecordRow(0).innerText)
Sheet.Cells(oRow, 2).Value = Trim(RecordRow(1).innerText)
Sheet.Cells(oRow, 3).Value = Trim(RecordRow(2).innerText)
Sheet.Cells(oRow, 4).Value = Trim(RecordRow(3).innerText)
Sheet.Cells(oRow, 5).Value = Trim(RecordRow(4).innerText)
Sheet.Cells(oRow, 6).Value = Trim(RecordRow(5).innerText)
oRow = oRow + 1
Next Record
Next Page
Browser.Quit
End Sub
Sub LoadPage(ByVal PageNumber As Integer)
Debug.Print "Navigating to Page #" & CStr(PageNumber)
Browser.navigate "https://www.worldathletics.org/world-rankings/100m/men?page=" & CStr(PageNumber)
While Browser.readyState 4 Or Browser.Busy: DoEvents: Wend
Debug.Print "Navigation Complete"
End Sub
更新代码
Index Out-of-Bound 错误很可能是由于硬编码的索引,如果页面没有 99 条记录则失败,或者如果记录没有 5 个字段则失败。下面的代码去除了索引,只抓取了它找到的每一行和单元格。你不应该得到索引错误,但输出可能是锯齿状的。
进一步更新
462 错误是由 Browser.Quit 引起的。这会关闭浏览器,但不会设置对 Nothing 的引用,因此当您再次运行代码时,它会尝试使用不存在的浏览器。最后将其显式设置为 null 可以解决此问题。
竞争对手列中没有链接,整行都有一个由其他东西处理的数据 url。但是,可以轻松访问该 URL。
Sub NewStart()
Dim PageDocument As IHTMLDocument
Dim Records As IHTMLElementCollection
Dim Record As IHTMLElement
Dim RecordItems As IHTMLElementCollection
Dim RecordItem As IHTMLElement
Dim OutputRow As Integer: OutputRow = 2
Dim OutputColumn As Integer
Dim Page As Integer
Dim Sheet As Worksheet: Set Sheet = ThisWorkbook.Worksheets("Sheet1")
If Browser Is Nothing Then
Set Browser = New InternetExplorer
Browser.Visible = True
End If
For Page = 1 To MaxPage
LoadPage Page
Set PageDocument = Browser.Document
Set Records = PageDocument.getElementById("toplists").getElementsByTagName("table")(0).getElementsByTagName("tbody")(0).getElementsByTagName("tr")
For Each Record In Records
Set RecordItems = Record.getElementsByTagName("td")
OutputColumn = 1
For Each RecordItem In RecordItems
Sheet.Cells(OutputRow, OutputColumn).Value = Trim(RecordItem.innerText)
OutputColumn = OutputColumn + 1
Next RecordItem
Sheet.Cells(OutputRow, OutputColumn).Value = "http://worldathletics.org/" & Record.getAttribute("data-athlete-url") ' This will add the link after the last column
OutputRow = OutputRow + 1
Next Record
Next Page
Browser.Quit
Set Browser = Nothing ' This will fix the 462 error
End Sub