Friday, July 21, 2006
Scraping data from NHL.com onto your hard drive
This macro strips the play-by-play sheets for the 0304 regular season off of the NHL.com database and into a directory called C:\NHL\PL0304. You will have to create the directory on your hard drive first, Or you can create a directory with a different name so long as you change the red bit of code below to match it.. By changing PL to GS in the code below you would be scraping the Game Sheets for thst season instead. By changing 20032004 to 20052006 you grab the sheets for theis past season. Simple enough, no?
BTW: I don't actually know how this code works, I just copied it from the net. But it does work for me.
Once you've created the directory on your hard drive, just open Excel, press Alt-F11, press Alt-I, press M, paste everything onto the sheet that shows up. Then press F5 and voila! You're scraping off data!
Little bits of code to come later will let you make use of this stuff you've scraped off in a simple and fast way.
Let me know if this works for anyone. Lowetide ... I'm looking at you. :-)
_____________________________________________________
Public Const INTERNET_OPEN_TYPE_PRECONFIG = 0
Public Const INTERNET_OPEN_TYPE_DIRECT = 1
Public Const INTERNET_OPEN_TYPE_PROXY = 3
Public Const scUserAgent = "VB OpenUrl"
Public Const INTERNET_FLAG_RELOAD = &H80000000
Public Declare Function InternetOpen Lib "wininet.dll" Alias _
"InternetOpenA" (ByVal sAgent As String, ByVal lAccessType _
As Long, ByVal sProxyName As String, ByVal sProxyBypass _
As String, ByVal lFlags As Long) As Long
Public Declare Function InternetOpenUrl Lib "wininet.dll" Alias _
"InternetOpenUrlA" (ByVal hOpen As Long, ByVal sUrl As String, _
ByVal sHeaders As String, ByVal lLength As Long, ByVal lFlags As _
Long, ByVal lContext As Long) As Long
Public Declare Function InternetReadFile Lib "wininet.dll" _
(ByVal hFile As Long, ByVal sBuffer As String, ByVal _
lNumBytesToRead As Long, lNumberOfBytesRead As Long) As Integer
Public Declare Function InternetCloseHandle Lib "wininet.dll" _
(ByVal hInet As Long) As Integer
Public Declare Function GIFToBMP Lib "gif2bmp.dll" _
(ByVal lpstrGIF As String, ByVal lpstrBMP As String) As Integer
Sub GetHTMLFromURL()
Dim sUrl As String
Dim s As String
Dim hOpen As Long
Dim hOpenUrl As Long
Dim bDoLoop As Boolean
Dim bRet As Boolean
Dim sReadBuffer As String * 2048
Dim lNumberOfBytesRead As Long
Dim j As Integer: Dim i As Integer: Dim m As Integer
Dim k As Long
Dim sG() As Byte
On Error GoTo NEXTj
For j = 1 To 1230
m = 20000 + j
s = Empty
sUrl = "http://www.nhl.com/scores/htmlreports/20032004/PL0" & m & ".HTM"
hOpen = InternetOpen(scUserAgent, INTERNET_OPEN_TYPE_PRECONFIG, vbNullString, vbNullString, 0)
hOpenUrl = InternetOpenUrl(hOpen, sUrl, vbNullString, 0, INTERNET_FLAG_RELOAD, 0)
bDoLoop = True
While bDoLoop
sReadBuffer = vbNullString
bRet = InternetReadFile(hOpenUrl, sReadBuffer, Len(sReadBuffer), lNumberOfBytesRead)
s = s & Left$(sReadBuffer, lNumberOfBytesRead)
If Not CBool(lNumberOfBytesRead) Then bDoLoop = False
Wend
If hOpenUrl <> 0 Then InternetCloseHandle (hOpenUrl)
If hOpen <> 0 Then InternetCloseHandle (hOpen)
k = Len(s)
ReDim sG(k) As Byte
For i = 1 To k: sG(i) = Asc(Mid(s, i, 1)): Next i
Open "C:\NHL\PL0304\PL0" & m & ".HTM" For Binary As #44 Len = 1
For i = 1 To k: Put #44, i, sG(i): Next i
Close #44
NEXTj:
Next j
End Sub
BTW: I don't actually know how this code works, I just copied it from the net. But it does work for me.
Once you've created the directory on your hard drive, just open Excel, press Alt-F11, press Alt-I, press M, paste everything onto the sheet that shows up. Then press F5 and voila! You're scraping off data!
Little bits of code to come later will let you make use of this stuff you've scraped off in a simple and fast way.
Let me know if this works for anyone. Lowetide ... I'm looking at you. :-)
_____________________________________________________
Public Const INTERNET_OPEN_TYPE_PRECONFIG = 0
Public Const INTERNET_OPEN_TYPE_DIRECT = 1
Public Const INTERNET_OPEN_TYPE_PROXY = 3
Public Const scUserAgent = "VB OpenUrl"
Public Const INTERNET_FLAG_RELOAD = &H80000000
Public Declare Function InternetOpen Lib "wininet.dll" Alias _
"InternetOpenA" (ByVal sAgent As String, ByVal lAccessType _
As Long, ByVal sProxyName As String, ByVal sProxyBypass _
As String, ByVal lFlags As Long) As Long
Public Declare Function InternetOpenUrl Lib "wininet.dll" Alias _
"InternetOpenUrlA" (ByVal hOpen As Long, ByVal sUrl As String, _
ByVal sHeaders As String, ByVal lLength As Long, ByVal lFlags As _
Long, ByVal lContext As Long) As Long
Public Declare Function InternetReadFile Lib "wininet.dll" _
(ByVal hFile As Long, ByVal sBuffer As String, ByVal _
lNumBytesToRead As Long, lNumberOfBytesRead As Long) As Integer
Public Declare Function InternetCloseHandle Lib "wininet.dll" _
(ByVal hInet As Long) As Integer
Public Declare Function GIFToBMP Lib "gif2bmp.dll" _
(ByVal lpstrGIF As String, ByVal lpstrBMP As String) As Integer
Sub GetHTMLFromURL()
Dim sUrl As String
Dim s As String
Dim hOpen As Long
Dim hOpenUrl As Long
Dim bDoLoop As Boolean
Dim bRet As Boolean
Dim sReadBuffer As String * 2048
Dim lNumberOfBytesRead As Long
Dim j As Integer: Dim i As Integer: Dim m As Integer
Dim k As Long
Dim sG() As Byte
On Error GoTo NEXTj
For j = 1 To 1230
m = 20000 + j
s = Empty
sUrl = "http://www.nhl.com/scores/htmlreports/20032004/PL0" & m & ".HTM"
hOpen = InternetOpen(scUserAgent, INTERNET_OPEN_TYPE_PRECONFIG, vbNullString, vbNullString, 0)
hOpenUrl = InternetOpenUrl(hOpen, sUrl, vbNullString, 0, INTERNET_FLAG_RELOAD, 0)
bDoLoop = True
While bDoLoop
sReadBuffer = vbNullString
bRet = InternetReadFile(hOpenUrl, sReadBuffer, Len(sReadBuffer), lNumberOfBytesRead)
s = s & Left$(sReadBuffer, lNumberOfBytesRead)
If Not CBool(lNumberOfBytesRead) Then bDoLoop = False
Wend
If hOpenUrl <> 0 Then InternetCloseHandle (hOpenUrl)
If hOpen <> 0 Then InternetCloseHandle (hOpen)
k = Len(s)
ReDim sG(k) As Byte
For i = 1 To k: sG(i) = Asc(Mid(s, i, 1)): Next i
Open "C:\NHL\PL0304\PL0" & m & ".HTM" For Binary As #44 Len = 1
For i = 1 To k: Put #44, i, sG(i): Next i
Close #44
NEXTj:
Next j
End Sub
Comments:
<< Home
Thanks, Vic. I won't have a chance to do it right away, but I'll try after the weekend!
God, this will save me hours.
God, this will save me hours.
Vic: I'm having some trouble getting that code to work for the event sheets. It has an overflow error after about 1000 event sheets and stops. Do you know how to fix that easily? Programming is not my bag.
riversq:
I don't know. It works for me. Try this code instead:
On Error GoTo NEXTj
For j = 1 To 1230
m = 20000 + j
Open "C:\NHL\ES0506\ES0" & m & ".HTM" For Binary As #44 Len = 1
sUrl = "http://www.nhl.com/scores/htmlreports/20052006/ES0" & m & ".HTM"
hOpen = InternetOpen(scUserAgent, INTERNET_OPEN_TYPE_PRECONFIG, vbNullString, vbNullString, 0)
hOpenUrl = InternetOpenUrl(hOpen, sUrl, vbNullString, 0, INTERNET_FLAG_RELOAD, 0)
bDoLoop = True
While bDoLoop
sReadBuffer = vbNullString
bRet = InternetReadFile(hOpenUrl, sReadBuffer, Len(sReadBuffer), lNumberOfBytesRead)
Put #44, , Left$(sReadBuffer, lNumberOfBytesRead)
If Not CBool(lNumberOfBytesRead) Then bDoLoop = False
Wend
If hOpenUrl <> 0 Then InternetCloseHandle (hOpenUrl)
If hOpen <> 0 Then InternetCloseHandle (hOpen)
Close #44
NEXTj:
Next j
.
This eliminates the sG array. Which is useful for other things (like picking off data that is used to make an only chart) but it isn't necessary here and if we remove it then it simplifies things a bit.
Let me know how it goes.
I don't know. It works for me. Try this code instead:
On Error GoTo NEXTj
For j = 1 To 1230
m = 20000 + j
Open "C:\NHL\ES0506\ES0" & m & ".HTM" For Binary As #44 Len = 1
sUrl = "http://www.nhl.com/scores/htmlreports/20052006/ES0" & m & ".HTM"
hOpen = InternetOpen(scUserAgent, INTERNET_OPEN_TYPE_PRECONFIG, vbNullString, vbNullString, 0)
hOpenUrl = InternetOpenUrl(hOpen, sUrl, vbNullString, 0, INTERNET_FLAG_RELOAD, 0)
bDoLoop = True
While bDoLoop
sReadBuffer = vbNullString
bRet = InternetReadFile(hOpenUrl, sReadBuffer, Len(sReadBuffer), lNumberOfBytesRead)
Put #44, , Left$(sReadBuffer, lNumberOfBytesRead)
If Not CBool(lNumberOfBytesRead) Then bDoLoop = False
Wend
If hOpenUrl <> 0 Then InternetCloseHandle (hOpenUrl)
If hOpen <> 0 Then InternetCloseHandle (hOpen)
Close #44
NEXTj:
Next j
.
This eliminates the sG array. Which is useful for other things (like picking off data that is used to make an only chart) but it isn't necessary here and if we remove it then it simplifies things a bit.
Let me know how it goes.
I'm working on scraping some nhl data. But can someone tell me what SHOT(!) or SHOT(*) or HIT (!) or HIT(*) means in nhl stat play-by-plays?
Post a Comment
<< Home