- Excel Version: 2016
When scraping the Web, it is sometimes necessary to analyse how a page was constructed. These are the main features of this post:
- Lists the XPath for every parent/child pair in a local HTML test file.
- Reports how many levels the page has and the tag of each element.
- When tested against an actual Web page (www.mrexcel.com), it found 339 elements arranged on 12 levels.

Output for the local test file:
| Child | Parent | Tag | Level |
| --- | --- | --- | --- |
//head | // | head | 1 |
//head/meta | //head | meta | 2 |
//head/title | //head | title | 2 |
//body | // | body | 1 |
//*[@id='b1'] | //body | input | 2 |
//*[@id='f1'] | //body | form | 2 |
//*[@id='d1'] | //*[@id='f1'] | div | 3 |
//*[@id='d2'] | //*[@id='d1'] | div | 4 |
//*[@id='d3'] | //*[@id='d1'] | div | 4 |
//*[@id='d4'] | //*[@id='d3'] | div | 5 |
//*[@id='d5'] | //*[@id='d3'] | div | 5 |
//*[@id='d6'] | //*[@id='d3'] | div | 5 |
//*[@id='d7'] | //*[@id='d6'] | div | 6 |
//*[@id='d8'] | //*[@id='d6'] | div | 6 |
//*[@id='d9'] | //*[@id='d6'] | div | 6 |
//*[@id='d10'] | //*[@id='d6'] | div | 6 |
//*[@id='s1'] | //*[@id='d10'] | span | 7 |
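The tree.htm used above is not reproduced here, but a minimal test file consistent with the table (the ids, tags and nesting come straight from it) would look something like this:

<html>
<head>
<meta charset="utf-8">
<title>tree</title>
</head>
<body>
<input id="b1" type="button" value="b1">
<form id="f1">
  <div id="d1">
    <div id="d2"></div>
    <div id="d3">
      <div id="d4"></div>
      <div id="d5"></div>
      <div id="d6">
        <div id="d7"></div>
        <div id="d8"></div>
        <div id="d9"></div>
        <div id="d10"><span id="s1"></span></div>
      </div>
    </div>
  </div>
</form>
</body>
</html>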
VBA Code:
' JavaScript run on each element: climbs the DOM and builds a short XPath,
' anchoring the path at the nearest ancestor that has an id
Private Const JS_XPATH = _
"var e=this,p=[];for(;e&&e.nodeType==1&&e.nodeName!='HTML';e=e.parentNode){if(e." & _
"id){p.unshift('*[@id=\''+e.id+'\']');break;}var i=1,u=1,t=e.localName,c=e.class" & _
"Name;for(var s=e.previousSibling;s;s=s.previousSibling){if(s.nodeType!=10&&s.no" & _
"deName==e.nodeName){if(c==s.className)c=null;u=0;++i;}}for(var s=e.nextSibling;" & _
"s;s=s.nextSibling){if(s.nodeName==e.nodeName){if(c==s.className)c=null;u=0;}}p." & _
"unshift(u?t:(t+(c?'[@class=\''+c+'\']':'['+i+']')));}return '//'+p.join('/');"
Sub Main()
    Dim j%, lr%
    Crawl                                       ' fill columns A:C (child, parent, tag)
    lr = Range("a:a").Find("*", SearchOrder:=xlByRows, SearchDirection:=xlPrevious).Row ' last used row
    FindVal "//", 0, Range("b:b")               ' children of the root ("//") get level 1
    j = 1                                       ' level 0 never occurs in column D, so start at 1
    Do                                          ' propagate levels until every row has one
        Sec j, Range("d:d"), Range("b:b")
        j = j + 1
    Loop While Evaluate("=counta(d2:d" & lr & ")") < lr - 1
End Sub
Sub FindVal(what$, lv%, r As Range)
    ' Mark every row whose parent column cell equals "what" with level lv + 1 in column D
    Dim c As Range, fa$
    Set c = r.Find(what, LookIn:=xlValues, lookat:=xlWhole)
    If Not c Is Nothing Then
        fa = c.Address                          ' remember the first hit to know when Find wraps
        Do
            Cells(c.Row, 4) = lv + 1            ' one level down
            Set c = r.FindNext(c)
        Loop While Not c Is Nothing And c.Address <> fa
    End If
End Sub
Sub CrArr(what%, r As Range, arr())
    ' Collect the addresses of every cell in "r" whose value equals "what";
    ' the ReDim below always leaves one spare (empty) slot at the end
    Dim c As Range, fa$, i%
    ReDim Preserve arr(1 To 1)
    i = 1
    Set c = r.Find(what, LookIn:=xlValues, lookat:=xlWhole)
    If Not c Is Nothing Then
        fa = c.Address
        Do
            arr(i) = c.Address
            ReDim Preserve arr(1 To (i + 1))    ' grow the array by one
            i = i + 1
            Set c = r.FindNext(c)
        Loop While Not c Is Nothing And c.Address <> fa
    End If
End Sub
Sub Sec(what%, r As Range, rr As Range)
    ' For every element already marked at level "what" in column D, find its
    ' children (rows whose parent column matches its XPath) and mark them one level down
    Dim aa(), i%
    CrArr what, r, aa
    For i = LBound(aa) To UBound(aa) - 1        ' the last slot is always empty; skip it
        FindVal Cells(Range(aa(i)).Row, 1), Range(aa(i)).Value, rr
    Next
End Sub
Sub Crawl()
    ' Requires SeleniumBasic and a reference to the Selenium Type Library
    Dim bot As ChromeDriver, els As WebElements, chd As WebElements, i%
    Set bot = New ChromeDriver
    Rem bot.Get "https://www.mrexcel.com"
    bot.Get "C:\Users\en_sa\Documents\tree.htm"  ' local test file
    Set els = bot.FindElementsByTag("html")
    Set chd = els(1).FindElementsByXPath(".//*") ' every descendant of <html>
    For i = 1 To chd.Count                       ' write below the header row
        Cells(i + 1, 1) = chd(i).ExecuteScript(JS_XPATH)                            ' child XPath
        Cells(i + 1, 2) = chd(i).FindElementByXPath("./..").ExecuteScript(JS_XPATH) ' parent XPath
        Cells(i + 1, 3) = chd(i).TagName
    Next
End Sub
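Crawl depends on SeleniumBasic, so the VBA project needs a reference to the Selenium Type Library (Tools > References). If setting the reference is not an option, late binding should also work, assuming SeleniumBasic is installed and registered:

Dim bot As Object
Set bot = CreateObject("Selenium.ChromeDriver") ' late-bound SeleniumBasic driver
bot.Get "C:\Users\en_sa\Documents\tree.htm"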
The inspiration for this article came from the still-unsolved thread below. The page's HTML keeps changing, probably to deter scraping, but there seems to be a pattern; maybe by cataloguing everything we can find a way to fetch the desired data. I still need to go back to the thread to test this approach.
https://www.mrexcel.com/board/threads/automatically-extract-website-data-to-a-spreadsheet.1196610/page-8#post-5920898