russellhq
03-03-2009, 04:41 AM
I have a project that requires importing tables from multiple HTML files into excel.
I first tried using the following;
Set objIE = CreateObject("InternetExplorer.Application")
Then using ie's function to read the cells in the table.
This worked perfectly, but was a little slow due to opening an instance of ie for every html file.
So my next step was to try and write some vba that would parse the html file and extract the contents of the table cells. I came up with this;
Sub ReadHTMLFile()
Dim sText As String 'string to hold html file text
Dim tT As String 'Entire Table string
Dim tR As String 'Table Row string
Dim tC As String 'Table column string
Application.Calculation = xlCalculationManual 'Turn off calculation
Application.ScreenUpdating = False 'Turn off updating
sText = GetText("C:\webtable.html") 'load html file
sText = Replace(sText, " ", Chr(32)) 'replace characters
sText = Replace(sText, Chr(9), "") 'replace characters
sText = Replace(sText, Chr(10), "") 'replace characters
sText = Replace(sText, Chr(13), "") 'replace characters
sText = Replace(sText, """, Chr(34)) 'replace characters
sText = Replace(sText, "<", Chr(60)) 'replace characters
sText = Replace(sText, ">", Chr(62)) 'replace characters
sText = Replace(sText, "&", Chr(38)) 'replace characters
sText = Replace(sText, " ", Chr(32)) 'replace characters
For i = 1 To 255
sText = Replace(sText, "&#" & i & ";", Chr(i)) 'replace characters
Next
tS = "<table" 'table start tag
tE = "/table>" 'table end tag
rS = "<tr" 'row start tag
rE = "/tr>" 'row start tag
cS = "<td" 'column start tag
cE = "/td>" 'column end tag
cC = 1 'current column
cR = 1 'current row
Do While InStr(1, sText, tS, vbTextCompare) > 0 ' Loop while there are still table start tags
sText = Right(sText, Len(sText) - InStr(1, sText, tS, vbTextCompare)) 'Chop all text before the first table start tag
tT = Left(sText, InStr(1, sText, tE, vbTextCompare) - 2) 'Set table string to first table
'Debug.Print tT
Do While InStr(1, tT, rS, vbTextCompare) > 0 'Loop until the last row
tR = Left(tT, InStr(1, tT, rE, vbTextCompare) - 2) 'Set row string
'debug.Print tR
Do While InStr(1, tR, cS, vbTextCompare) > 0 'Loop until last column in row
tR = Right(tR, Len(tR) - InStr(1, tR, cS, vbTextCompare) - 2) 'Chop all text before first column
'debug.Print tR
tC = Left(tR, InStr(1, tR, cE, vbTextCompare) - 1) 'First column text equals start of string to first column end tag
Do While InStr(1, tC, ">", vbTextCompare) > 0 'Loop while there are still end of tag symbols
'debug.Print tC
If InStr(1, tC, ">", vbTextCompare) + 1 < InStr(1, tC, "<", vbTextCompare) Then ' if there is a gap between >< then that's the text I want
Cells(cR, cC) = Right(Left(tC, InStr(1, tC, "<", vbTextCompare) - 1), Len(Left(tC, InStr(1, tC, "<", vbTextCompare) - 1)) - InStr(1, tC, ">", vbTextCompare)) 'Copy text in gap
'Debug.Print Right(Left(tC, InStr(1, tC, "<", vbTextCompare) - 1), Len(Left(tC, InStr(1, tC, "<", vbTextCompare) - 1)) - InStr(1, tC, ">", vbTextCompare))
tC = Right(tC, Len(tC) - InStr(1, tC, ">", vbTextCompare)) ' Chop text from start to first ">"
'Debug.Print tC
Else
If InStr(1, tC, ">", vbTextCompare) > 0 Then ' If no gap but still a ">" then:
tC = Right(tC, Len(tC) - InStr(1, tC, ">", vbTextCompare) + 1) 'Chop all text to left of ">" symbol
If Left(tC, 2) = "><" Then 'Remove >< from start to stop endless loop
tC = Right(tC, Len(tC) - 2)
End If
'Debug.Print tC
End If
End If
Loop
cC = cC + 1 'Count current column
Loop
tT = Right(tT, Len(tT) - InStr(1, tT, rE, vbTextCompare) - 3) 'Chop all table text before next row
cR = cR + 1 'Count current row
cC = 1 'Reset column count for new row
Loop
cR = cR + 1
Cells(cR, 1) = "--------------------------------------------------------------------" 'Show a break between tables
Loop
Application.ScreenUpdating = True 'Turn updating back on
Application.Calculate 'calculate sheet
Application.Calculation = xlCalculationAutomatic 'Turn automatic calculation back on
End Sub
Function GetText(sFile As String) As String
Dim nSourceFile As Integer, sText As String
''Close any open text files
Close
''Get the number of the next free text file
nSourceFile = FreeFile
''Write the entire file to sText
Open sFile For Input As #nSourceFile
sText = Input$(LOF(1), 1)
Close
GetText = sText
End Function
It works OK, faster than ie but still a little slow. Probably due to all the looping.
Does anyone know any faster ways to import multiple tables from multiple HTML files (stored locally). Or can you see a way to speed up my script?
I first tried using the following;
Set objIE = CreateObject("InternetExplorer.Application")
Then using ie's function to read the cells in the table.
This worked perfectly, but was a little slow due to opening an instance of ie for every html file.
So my next step was to try and write some vba that would parse the html file and extract the contents of the table cells. I came up with this;
Sub ReadHTMLFile()
Dim sText As String 'string to hold html file text
Dim tT As String 'Entire Table string
Dim tR As String 'Table Row string
Dim tC As String 'Table column string
Application.Calculation = xlCalculationManual 'Turn off calculation
Application.ScreenUpdating = False 'Turn off updating
sText = GetText("C:\webtable.html") 'load html file
sText = Replace(sText, " ", Chr(32)) 'replace characters
sText = Replace(sText, Chr(9), "") 'replace characters
sText = Replace(sText, Chr(10), "") 'replace characters
sText = Replace(sText, Chr(13), "") 'replace characters
sText = Replace(sText, """, Chr(34)) 'replace characters
sText = Replace(sText, "<", Chr(60)) 'replace characters
sText = Replace(sText, ">", Chr(62)) 'replace characters
sText = Replace(sText, "&", Chr(38)) 'replace characters
sText = Replace(sText, " ", Chr(32)) 'replace characters
For i = 1 To 255
sText = Replace(sText, "&#" & i & ";", Chr(i)) 'replace characters
Next
tS = "<table" 'table start tag
tE = "/table>" 'table end tag
rS = "<tr" 'row start tag
rE = "/tr>" 'row start tag
cS = "<td" 'column start tag
cE = "/td>" 'column end tag
cC = 1 'current column
cR = 1 'current row
Do While InStr(1, sText, tS, vbTextCompare) > 0 ' Loop while there are still table start tags
sText = Right(sText, Len(sText) - InStr(1, sText, tS, vbTextCompare)) 'Chop all text before the first table start tag
tT = Left(sText, InStr(1, sText, tE, vbTextCompare) - 2) 'Set table string to first table
'Debug.Print tT
Do While InStr(1, tT, rS, vbTextCompare) > 0 'Loop until the last row
tR = Left(tT, InStr(1, tT, rE, vbTextCompare) - 2) 'Set row string
'debug.Print tR
Do While InStr(1, tR, cS, vbTextCompare) > 0 'Loop until last column in row
tR = Right(tR, Len(tR) - InStr(1, tR, cS, vbTextCompare) - 2) 'Chop all text before first column
'debug.Print tR
tC = Left(tR, InStr(1, tR, cE, vbTextCompare) - 1) 'First column text equals start of string to first column end tag
Do While InStr(1, tC, ">", vbTextCompare) > 0 'Loop while there are still end of tag symbols
'debug.Print tC
If InStr(1, tC, ">", vbTextCompare) + 1 < InStr(1, tC, "<", vbTextCompare) Then ' if there is a gap between >< then that's the text I want
Cells(cR, cC) = Right(Left(tC, InStr(1, tC, "<", vbTextCompare) - 1), Len(Left(tC, InStr(1, tC, "<", vbTextCompare) - 1)) - InStr(1, tC, ">", vbTextCompare)) 'Copy text in gap
'Debug.Print Right(Left(tC, InStr(1, tC, "<", vbTextCompare) - 1), Len(Left(tC, InStr(1, tC, "<", vbTextCompare) - 1)) - InStr(1, tC, ">", vbTextCompare))
tC = Right(tC, Len(tC) - InStr(1, tC, ">", vbTextCompare)) ' Chop text from start to first ">"
'Debug.Print tC
Else
If InStr(1, tC, ">", vbTextCompare) > 0 Then ' If no gap but still a ">" then:
tC = Right(tC, Len(tC) - InStr(1, tC, ">", vbTextCompare) + 1) 'Chop all text to left of ">" symbol
If Left(tC, 2) = "><" Then 'Remove >< from start to stop endless loop
tC = Right(tC, Len(tC) - 2)
End If
'Debug.Print tC
End If
End If
Loop
cC = cC + 1 'Count current column
Loop
tT = Right(tT, Len(tT) - InStr(1, tT, rE, vbTextCompare) - 3) 'Chop all table text before next row
cR = cR + 1 'Count current row
cC = 1 'Reset column count for new row
Loop
cR = cR + 1
Cells(cR, 1) = "--------------------------------------------------------------------" 'Show a break between tables
Loop
Application.ScreenUpdating = True 'Turn updating back on
Application.Calculate 'calculate sheet
Application.Calculation = xlCalculationAutomatic 'Turn automatic calculation back on
End Sub
Function GetText(sFile As String) As String
Dim nSourceFile As Integer, sText As String
''Close any open text files
Close
''Get the number of the next free text file
nSourceFile = FreeFile
''Write the entire file to sText
Open sFile For Input As #nSourceFile
sText = Input$(LOF(1), 1)
Close
GetText = sText
End Function
It works OK, faster than ie but still a little slow. Probably due to all the looping.
Does anyone know any faster ways to import multiple tables from multiple HTML files (stored locally). Or can you see a way to speed up my script?