Create your own Web Robot

Create your own bot that searches the net for the keywords you enter, it shows the results in the taskbar, you can run the application and continue with your routine work. The bot will show you the results after searching the net and the results will be stored in the database. You can open and run the search again any time. A separate database is created for each individual search. You can learn lots of concepts from this program. Good Luck!!!

For this to work, add 4 buttons to a form. Call all of them Command1, and set their index properties to 0,1,2,3 and 4. (0=New Search, 1=Open Search,2=Run Search,3=Cancel Search,4=Quit)

Option Explicit

Dim gCancel As Boolean

'Threshold for relevance; if a page's relevance is below this, its
'links are not are not added to the search. A setting of 0
'results in all of a page's links being added.

Const RELEVANCE_THRESHOLD = 0

Public Function GetLinks(s As String, baseURL As String) As String

'Passed some html text in s, locates all the hyperlinks and
'returns the URLs in a string seperated by the | character.
'Returns only http links and ignores mailto and ftp.
'baseurl is the base location of the html document that is
'passed in s. If a link is found that is a local link,this is
'added so the return value is a fully qualified URL.

Dim pos As Long, pos1 As Long, pos2 As Long
Dim buf As String, temp As String
Dim sq As String, dq As String
Dim qc As String, start As String

buf = ""

'Make sure a non empty string is passed
If s = Null Or Len(s) = 0 Then
   GetLinks = buf
   Exit Function
End If

'Make sure there is at least one link
start = InStr(1, s, "<a href=", vbTextCompare)
If start = 0 Then
   GetLinks = buf
   Exit Function
End If

'Declare the single and double quote characters.
dq = Chr$(34)
sq = Chr$(39)

Do
   'Get the first double or single quote character.
   'That marks the start of the URL.
   pos = InStr(start, s, dq, vbTextCompare)
   pos2 = InStr(start, s, sq, vbTextCompare)
   If pos = 0 And pos2 = 0 Then Exit Do
   
   'If both a single and double quote were found, the first one marks
   'the start of the URL
   If pos > 0 And pos2 > 0 Then 'both were found
       If pos < pos2 Then 'It is the double quote
           qc = dq
       Else 'It is the single quote
           qc = sq
           pos = pos2
       End If
   ElseIf pos = 0 Then 'Only the single quote was found
       qc = sq
       pos = pos2
   ElseIf pos2 = 0 Then 'Only a double quote was found
       qc = dq
   End If
   
   'Get the next quote character. That marks the end of the URL.
   pos1 = InStr(pos + 1, s, qc, vbTextCompare)
   If pos1 = 0 Then Exit Do
   temp = Mid$(s, pos + 1, pos1 - pos - 1)
   
   'Dont accept mailto and ftp links
   If LCase(Left(temp, 7)) = "mailto:" Or LCase(Left(temp, 3)) = "ftp" Then
       GoTo DoNotAdd
   End If
   
   'See whether it is a full URL. if not, add the base URL
   If LCase(Left(temp, 7)) <> "http://" Then
       temp = baseURL & temp
   End If
   
   'Strip off anything following a # or ? character in the link
   pos = InStr(1, temp, "#")
   
   If pos > 0 Then
       temp = Left(temp, pos - 1)
   End If
   
   pos = InStr(1, temp, "?")
   
   If pos > 0 Then
       temp = Left(temp, pos - 1)
   End If
   
   buf = buf & temp & "|"
   
DoNotAdd:
   'Locate the next link
   pos = InStr(pos1, s, "<a href=", vbTextCompare)
   start = pos
   
   'If there are no more links, quit
   If pos = 0 Then Exit Do
   DoEvents
   
Loop While True

'Strip off the tailing |.
GetLinks = Left(buf, Len(buf) - 1)

End Function

Public Function StripTags(ByVal HtmlDoc As String) As String

'Passed the text of an HTML document, returns the text with all
'HTML tags removed. Specifically, removes all < > pairs and
'the text between them.

Dim buf As String, pos As String

'Check for no tags situation
pos = InStr(HtmlDoc, "<")
If pos = 0 Then
   StripTags = HtmlDoc
   Exit Function
End If

Do
   'Add text to left of first tag to buffer
   buf = buf & Left$(HtmlDoc, pos - 1)
   'Find end of tag
   pos = InStr(HtmlDoc, ">")
   'Remove everything up to the end of tag from the text.
   If pos = 0 Then Exit Do
   HtmlDoc = Mid$(HtmlDoc, pos + 1)
   
   'Find the start of the next tag
   pos = InStr(HtmlDoc, "<")
   If pos = 0 Then Exit Do
Loop While True

StripTags = buf

End Function

Public Function SearchForKW(str As String, kw As String) As Long

'Returns the number of times that the string kw is found in the
'string str, case-insensitive. Maximum count is 10 because higher
'hit rates will probably not mean higher relevance.

Dim buf As String, pos As Long, count As Integer

'Make a local copy minus HTML tags.
buf = StripTags(str)

pos = InStr(1, buf, kw, vbTextCompare)
If pos = 0 Then
   SearchForKW = 0
   Exit Function
End If

count = 0

Do
   count = count + 1
   'Strip off everything up to and including the just-found
   'keyword
   buf = Right(buf, Len(buf) - pos - Len(kw))
   pos = InStr(1, buf, kw, vbTextCompare)
Loop Until count = 10 Or pos = 0

SearchForKW = count

End Function

Private Sub Command1_Click(Index As Integer)

Dim reply As Integer

Select Case Index

   Case 0 'New Search
       Call NewSearch
   
   Case 1 'Open Search
       Call OpenSearch
       
   Case 2 ' Run Search
       Call RunSearch
       reply = MsgBox("Done - display report?", vbYesNo)
       If reply = vbYes Then Call frmReport.Show
       
   Case 3 'Cancel Search
       gCancel = True
       
   Case 4 'quit
       End
       
   End Select
End Sub

Public Sub OpenSearch()
'Opens an existing search database.

Dim gDataFileName

With CD
   .Filter = "Search databases | *.mdb"
   .InitDir = App.Path
   .ShowOpen
   gDataFileName = .FileName
End With

If gDataFileName <> "" Then
   Caption = gDataFileName
   Command1(2).Enabled = True  'Open Search
End If

End Sub

Private Sub Form_Load()

gDataFileName = ""
Command1(0).Enabled = True 'new search
Command1(1).Enabled = True 'open search
Command1(2).Enabled = False 'run search
Command1(3).Enabled = False 'cancel search
Command1(4).Enabled = True 'quit search
gCancel = False

Caption = "No search loaded"

End Sub

Public Sub NewSearch()

'Create a new search

gDataFileName = ""
frmNewSearch.Show 1
If gDataFileName <> "" Then
   Caption = gDataFileName
   Command1(2).Enabled = True
End If

End Sub

Public Sub RunSearch()

'Runs a search based on the currently loaded search data.
'Terminates when MAXLINKS links have been examined or no
'more links are available.

Dim strConnect As String
Dim buf As String
Dim count As Long, relevance As Integer
Dim oldCaption As String

'For keywords and their importance values
Dim kw() As String
Dim impr() As Integer
'Number of keywords
Dim numkw As Integer

On Error Resume Next

'Initialize
numkw = 0
oldCaption = frmMain.Caption

'Enable Cancel button
Command1(3).Enabled = True
'Disable other buttons
Command1(0).Enabled = False
Command1(1).Enabled = False
Command1(2).Enabled = False
Command1(4).Enabled = False

'Create the database objects
strConnect = "Provider=Microsoft.Jet.OLEDB.3.51;"
strConnect = strConnect & "Persist Security Info = false;"
strConnect = strConnect & "Data Source = " & gDataFileName

Set cnLinks = New ADODB.Connection
Set rsURL = New ADODB.Recordset
Set rskeywords = New ADODB.Recordset

cnLinks.ConnectionString = strConnect
cnLinks.ConnectionTimeout = 10
cnLinks.CursorLocation = adUseNone
cnLinks.Open

'Get the keywords from the database and put them in an array.
'Put the importance values in their own array

rskeywords.Open "select * from keywords", cnLinks, _
   adOpenDynamic, adLockOptimistic, adCmdText
   
Do
   numkw = numkw + 1
   ReDim Preserve kw(numkw)
   ReDim Preserve impr(numkw)
   kw(numkw) = rskeywords!keyword
   impr(numkw) = rskeywords!importance
   rskeywords.MoveNext
Loop Until rskeywords.EOF = True

'Destroy the keywords recordset because we are done with it
Set rskeywords = Nothing

'Open the URLs list in the database.
rsURL.Open "Select * from links", cnLinks, _
   adOpenDynamic, adLockOptimistic, adCmdText
   
rsURL.MoveFirst

Do
   'Debug.print "Trying " & rsurl!url & ", " & rsurl!Searched
   If rsURL!searched = DateValue(BASEDATE) Then
       rsURL!searched = Now
       Call SearchURL(rsURL!URL, kw(), impr())
       count = count + 1
       frmMain.Caption = count
       rsURL.MoveFirst
   Else
       rsURL.MoveNext
   End If
   DoEvents
Loop Until count > MAXLINKS Or rsURL.EOF Or gCancel = True

'Clean up
gCancel = False
Command1(3).Enabled = False
Command1(0).Enabled = True
Command1(1).Enabled = True
Command1(2).Enabled = True
Command1(4).Enabled = True
frmMain.Caption = oldCaption

'Destroy the database objects
Set cnLinks = Nothing
Set rsURL = Nothing

End Sub

Public Sub SearchURL(URL As String, kw() As String, impr() As Integer)

'URL is the full url to search
'kw() is a string array of the keywords to look for
'impr() is an integer array of the relevance importance
'of each keyword in kw()

Dim buf As String, i As Integer
Dim rel As Long

On Error Resume Next

'Set the timeout value in seconds
Inet1.RequestTimeout = 30

'Open the URL
buf = Inet1.OpenURL(URL, icString)
DoEvents

'Catch time out and server not found errors
If Err = 35761 Then
   Err.Clear
   Exit Sub
ElseIf buf = "" Then
   Exit Sub
End If

'Search for the various keywords,
'one at a time
rel = 0

For i = 1 To UBound(kw)
   rel = rel + (impr(i) / 5) * SearchForKW(buf, kw(i))
Next i

rel = rel \ UBound(kw)
rsURL!relevance = rel
Debug.Print rsURL!URL & ": " & rsURL!relevance

If rel >= RELEVANCE_THRESHOLD Then
   Call AddLinksToSearch(buf, MakeBaseURL(URL))
End If

End Sub

Public Sub AddLinksToSearch(s As String, baseURL As String)

'Extracts the http:// links from s and adds them to the search
'database, s is the text contents of an HTML file

'Returns true if any links were added to the database, false if not

Dim links As String, newURL As String
Dim pos As Long

'Necessary so that "duplicate data" errors will not stop execution
On Error Resume Next

links = GetLinks(s, baseURL)

'No links to add
If Len(links) < 9 Then
   Exit Sub
End If

pos = 1

Do
   pos = InStr(1, links, "|") 'Remember links are stored with a |
       'seperating them in the list
   If pos = 0 Then Exit Do 'no links are found
   newURL = Left(links, pos - 1) 'if links are found then extract the link and add it to the newurl variable
   links = Right(links, Len(links) - pos) 'After extracting the first
       'link, store the remaining links in the variable called
       'links
   'Open the url recordset and the links to the database
   rsURL.AddNew
   rsURL!URL = newURL
   rsURL!relevance = 0
   rsURL!starting = False 'This link is not the starting url
   rsURL!searched = DateValue(BASEDATE) 'Store the date when this link
       'was searched
   rsURL.Update 'Update the database
   'Uncomment the following debug.print statements to see
   'a list of accepted and duplicate urls in the immediate
   'window while executing in VB.
   If Err.Number = &H80040E21 Then
       'Debug.Print newURL & " rejected as duplicate"
       'Debug.Print "description: " & Err.Description
       MsgBox newURL & " rejected as duplicate"
   Else
       'Debug.Print newURL & " Added"
       MsgBox newURL & " added"
   End If
   DoEvents 'yields execution  so that the operating system
       'can process other events. doevents function forces the
       'processor to use its multitasking capabilities. The current
       'process is run as well as other processes in the queue
       'are taken care of. the control returns to the current
       'application once the task is completed
   
   
Loop While True


End Sub

Public Function MakeBaseURL(URL As String) As String

'The function accepts the complete url, returns the base part - the
'URL minus any filenames and with a trailing "/".

'For Example:

'http://www.abc.com/index.htm --> http://www.abc.com/
'http://www.abc.com/sub/data.htm --> http://www.abc.com/sub
'http://www.abc.com --> http://www.abc.com/

'Remember the trailing "/" is appended to the url

'Declare the necessary variables
Dim s As String, pos As Integer

'look for the last / in the URL
pos = InStrRev(URL, "/")
'InStrRev has a different syntax than the instr function
'Instr function starts searching from the left side or the
'start of the string but instrrev function starts searching
'from the end of the string

'if it is less than 8,it is part of the http:// protocol specifier
'Therefore, we can just tack (tack: stick in) a / at the end and return it.

If pos < 8 Then
   MakeBaseURL = URL & "/"
   Exit Function
   
   'Otherwise, strip off everything after the last /, thus removing
   'the filename.
   
Else
   MakeBaseURL = Left(URL, pos)
End If

End Function

You might also like...

Comments

S.S. Ahmed S.S. Ahmed is a senior IT Professional and works for a web and software development firm. Ahmed is a Microsoft Office SharePoint Server MVP. Ahmed specializes in creating database driven dynamic we...

Contribute

Why not write for us? Or you could submit an event or a user group in your area. Alternatively just tell us what you think!

Our tools

We've got automatic conversion tools to convert C# to VB.NET, VB.NET to C#. Also you can compress javascript and compress css and generate sql connection strings.

“There are only 3 numbers of interest to a computer scientist: 1, 0 and infinity”