Library code snippets

Create your own Web Robot

Create your own bot that searches the net for the keywords you enter, it shows the results in the taskbar, you can run the application and continue with your routine work. The bot will show you the results after searching the net and the results will be stored in the database. You can open and run the search again any time. A separate database is created for each individual search. You can learn lots of concepts from this program. Good Luck!!!

For this to work, add 4 buttons to a form. Call all of them Command1, and set their index properties to 0,1,2,3 and 4. (0=New Search, 1=Open Search,2=Run Search,3=Cancel Search,4=Quit)

Option Explicit

Dim gCancel As Boolean

'Threshold for relevance; if a page's relevance is below this, its
'links are not are not added to the search. A setting of 0
'results in all of a page's links being added.

Const RELEVANCE_THRESHOLD = 0

Public Function GetLinks(s As String, baseURL As String) As String

'Passed some html text in s, locates all the hyperlinks and
'returns the URLs in a string seperated by the | character.
'Returns only http links and ignores mailto and ftp.
'baseurl is the base location of the html document that is
'passed in s. If a link is found that is a local link,this is
'added so the return value is a fully qualified URL.

Dim pos As Long, pos1 As Long, pos2 As Long
Dim buf As String, temp As String
Dim sq As String, dq As String
Dim qc As String, start As String

buf = ""

'Make sure a non empty string is passed
If s = Null Or Len(s) = 0 Then
   GetLinks = buf
   Exit Function
End If

'Make sure there is at least one link
start = InStr(1, s, "<a href=", vbTextCompare)
If start = 0 Then
   GetLinks = buf
   Exit Function
End If

'Declare the single and double quote characters.
dq = Chr$(34)
sq = Chr$(39)

Do
   'Get the first double or single quote character.
   'That marks the start of the URL.
   pos = InStr(start, s, dq, vbTextCompare)
   pos2 = InStr(start, s, sq, vbTextCompare)
   If pos = 0 And pos2 = 0 Then Exit Do
   
   'If both a single and double quote were found, the first one marks
   'the start of the URL
   If pos > 0 And pos2 > 0 Then 'both were found
       If pos < pos2 Then 'It is the double quote
           qc = dq
       Else 'It is the single quote
           qc = sq
           pos = pos2
       End If
   ElseIf pos = 0 Then 'Only the single quote was found
       qc = sq
       pos = pos2
   ElseIf pos2 = 0 Then 'Only a double quote was found
       qc = dq
   End If
   
   'Get the next quote character. That marks the end of the URL.
   pos1 = InStr(pos + 1, s, qc, vbTextCompare)
   If pos1 = 0 Then Exit Do
   temp = Mid$(s, pos + 1, pos1 - pos - 1)
   
   'Dont accept mailto and ftp links
   If LCase(Left(temp, 7)) = "mailto:" Or LCase(Left(temp, 3)) = "ftp" Then
       GoTo DoNotAdd
   End If
   
   'See whether it is a full URL. if not, add the base URL
   If LCase(Left(temp, 7)) <> "http://" Then
       temp = baseURL & temp
   End If
   
   'Strip off anything following a # or ? character in the link
   pos = InStr(1, temp, "#")
   
   If pos > 0 Then
       temp = Left(temp, pos - 1)
   End If
   
   pos = InStr(1, temp, "?")
   
   If pos > 0 Then
       temp = Left(temp, pos - 1)
   End If
   
   buf = buf & temp & "|"
   
DoNotAdd:
   'Locate the next link
   pos = InStr(pos1, s, "<a href=", vbTextCompare)
   start = pos
   
   'If there are no more links, quit
   If pos = 0 Then Exit Do
   DoEvents
   
Loop While True

'Strip off the tailing |.
GetLinks = Left(buf, Len(buf) - 1)

End Function

Public Function StripTags(ByVal HtmlDoc As String) As String

'Passed the text of an HTML document, returns the text with all
'HTML tags removed. Specifically, removes all < > pairs and
'the text between them.

Dim buf As String, pos As String

'Check for no tags situation
pos = InStr(HtmlDoc, "<")
If pos = 0 Then
   StripTags = HtmlDoc
   Exit Function
End If

Do
   'Add text to left of first tag to buffer
   buf = buf & Left$(HtmlDoc, pos - 1)
   'Find end of tag
   pos = InStr(HtmlDoc, ">")
   'Remove everything up to the end of tag from the text.
   If pos = 0 Then Exit Do
   HtmlDoc = Mid$(HtmlDoc, pos + 1)
   
   'Find the start of the next tag
   pos = InStr(HtmlDoc, "<")
   If pos = 0 Then Exit Do
Loop While True

StripTags = buf

End Function

Public Function SearchForKW(str As String, kw As String) As Long

'Returns the number of times that the string kw is found in the
'string str, case-insensitive. Maximum count is 10 because higher
'hit rates will probably not mean higher relevance.

Dim buf As String, pos As Long, count As Integer

'Make a local copy minus HTML tags.
buf = StripTags(str)

pos = InStr(1, buf, kw, vbTextCompare)
If pos = 0 Then
   SearchForKW = 0
   Exit Function
End If

count = 0

Do
   count = count + 1
   'Strip off everything up to and including the just-found
   'keyword
   buf = Right(buf, Len(buf) - pos - Len(kw))
   pos = InStr(1, buf, kw, vbTextCompare)
Loop Until count = 10 Or pos = 0

SearchForKW = count

End Function

Private Sub Command1_Click(Index As Integer)

Dim reply As Integer

Select Case Index

   Case 0 'New Search
       Call NewSearch
   
   Case 1 'Open Search
       Call OpenSearch
       
   Case 2 ' Run Search
       Call RunSearch
       reply = MsgBox("Done - display report?", vbYesNo)
       If reply = vbYes Then Call frmReport.Show
       
   Case 3 'Cancel Search
       gCancel = True
       
   Case 4 'quit
       End
       
   End Select
End Sub

Public Sub OpenSearch()
'Opens an existing search database.

Dim gDataFileName

With CD
   .Filter = "Search databases | *.mdb"
   .InitDir = App.Path
   .ShowOpen
   gDataFileName = .FileName
End With

If gDataFileName <> "" Then
   Caption = gDataFileName
   Command1(2).Enabled = True  'Open Search
End If

End Sub

Private Sub Form_Load()

gDataFileName = ""
Command1(0).Enabled = True 'new search
Command1(1).Enabled = True 'open search
Command1(2).Enabled = False 'run search
Command1(3).Enabled = False 'cancel search
Command1(4).Enabled = True 'quit search
gCancel = False

Caption = "No search loaded"

End Sub

Public Sub NewSearch()

'Create a new search

gDataFileName = ""
frmNewSearch.Show 1
If gDataFileName <> "" Then
   Caption = gDataFileName
   Command1(2).Enabled = True
End If

End Sub

Public Sub RunSearch()

'Runs a search based on the currently loaded search data.
'Terminates when MAXLINKS links have been examined or no
'more links are available.

Dim strConnect As String
Dim buf As String
Dim count As Long, relevance As Integer
Dim oldCaption As String

'For keywords and their importance values
Dim kw() As String
Dim impr() As Integer
'Number of keywords
Dim numkw As Integer

On Error Resume Next

'Initialize
numkw = 0
oldCaption = frmMain.Caption

'Enable Cancel button
Command1(3).Enabled = True
'Disable other buttons
Command1(0).Enabled = False
Command1(1).Enabled = False
Command1(2).Enabled = False
Command1(4).Enabled = False

'Create the database objects
strConnect = "Provider=Microsoft.Jet.OLEDB.3.51;"
strConnect = strConnect & "Persist Security Info = false;"
strConnect = strConnect & "Data Source = " & gDataFileName

Set cnLinks = New ADODB.Connection
Set rsURL = New ADODB.Recordset
Set rskeywords = New ADODB.Recordset

cnLinks.ConnectionString = strConnect
cnLinks.ConnectionTimeout = 10
cnLinks.CursorLocation = adUseNone
cnLinks.Open

'Get the keywords from the database and put them in an array.
'Put the importance values in their own array

rskeywords.Open "select * from keywords", cnLinks, _
   adOpenDynamic, adLockOptimistic, adCmdText
   
Do
   numkw = numkw + 1
   ReDim Preserve kw(numkw)
   ReDim Preserve impr(numkw)
   kw(numkw) = rskeywords!keyword
   impr(numkw) = rskeywords!importance
   rskeywords.MoveNext
Loop Until rskeywords.EOF = True

'Destroy the keywords recordset because we are done with it
Set rskeywords = Nothing

'Open the URLs list in the database.
rsURL.Open "Select * from links", cnLinks, _
   adOpenDynamic, adLockOptimistic, adCmdText
   
rsURL.MoveFirst

Do
   'Debug.print "Trying " & rsurl!url & ", " & rsurl!Searched
   If rsURL!searched = DateValue(BASEDATE) Then
       rsURL!searched = Now
       Call SearchURL(rsURL!URL, kw(), impr())
       count = count + 1
       frmMain.Caption = count
       rsURL.MoveFirst
   Else
       rsURL.MoveNext
   End If
   DoEvents
Loop Until count > MAXLINKS Or rsURL.EOF Or gCancel = True

'Clean up
gCancel = False
Command1(3).Enabled = False
Command1(0).Enabled = True
Command1(1).Enabled = True
Command1(2).Enabled = True
Command1(4).Enabled = True
frmMain.Caption = oldCaption

'Destroy the database objects
Set cnLinks = Nothing
Set rsURL = Nothing

End Sub

Public Sub SearchURL(URL As String, kw() As String, impr() As Integer)

'URL is the full url to search
'kw() is a string array of the keywords to look for
'impr() is an integer array of the relevance importance
'of each keyword in kw()

Dim buf As String, i As Integer
Dim rel As Long

On Error Resume Next

'Set the timeout value in seconds
Inet1.RequestTimeout = 30

'Open the URL
buf = Inet1.OpenURL(URL, icString)
DoEvents

'Catch time out and server not found errors
If Err = 35761 Then
   Err.Clear
   Exit Sub
ElseIf buf = "" Then
   Exit Sub
End If

'Search for the various keywords,
'one at a time
rel = 0

For i = 1 To UBound(kw)
   rel = rel + (impr(i) / 5) * SearchForKW(buf, kw(i))
Next i

rel = rel \ UBound(kw)
rsURL!relevance = rel
Debug.Print rsURL!URL & ": " & rsURL!relevance

If rel >= RELEVANCE_THRESHOLD Then
   Call AddLinksToSearch(buf, MakeBaseURL(URL))
End If

End Sub

Public Sub AddLinksToSearch(s As String, baseURL As String)

'Extracts the http:// links from s and adds them to the search
'database, s is the text contents of an HTML file

'Returns true if any links were added to the database, false if not

Dim links As String, newURL As String
Dim pos As Long

'Necessary so that "duplicate data" errors will not stop execution
On Error Resume Next

links = GetLinks(s, baseURL)

'No links to add
If Len(links) < 9 Then
   Exit Sub
End If

pos = 1

Do
   pos = InStr(1, links, "|") 'Remember links are stored with a |
       'seperating them in the list
   If pos = 0 Then Exit Do 'no links are found
   newURL = Left(links, pos - 1) 'if links are found then extract the link and add it to the newurl variable
   links = Right(links, Len(links) - pos) 'After extracting the first
       'link, store the remaining links in the variable called
       'links
   'Open the url recordset and the links to the database
   rsURL.AddNew
   rsURL!URL = newURL
   rsURL!relevance = 0
   rsURL!starting = False 'This link is not the starting url
   rsURL!searched = DateValue(BASEDATE) 'Store the date when this link
       'was searched
   rsURL.Update 'Update the database
   'Uncomment the following debug.print statements to see
   'a list of accepted and duplicate urls in the immediate
   'window while executing in VB.
   If Err.Number = &H80040E21 Then
       'Debug.Print newURL & " rejected as duplicate"
       'Debug.Print "description: " & Err.Description
       MsgBox newURL & " rejected as duplicate"
   Else
       'Debug.Print newURL & " Added"
       MsgBox newURL & " added"
   End If
   DoEvents 'yields execution  so that the operating system
       'can process other events. doevents function forces the
       'processor to use its multitasking capabilities. The current
       'process is run as well as other processes in the queue
       'are taken care of. the control returns to the current
       'application once the task is completed
   
   
Loop While True


End Sub

Public Function MakeBaseURL(URL As String) As String

'The function accepts the complete url, returns the base part - the
'URL minus any filenames and with a trailing "/".

'For Example:

'http://www.abc.com/index.htm --> http://www.abc.com/
'http://www.abc.com/sub/data.htm --> http://www.abc.com/sub
'http://www.abc.com --> http://www.abc.com/

'Remember the trailing "/" is appended to the url

'Declare the necessary variables
Dim s As String, pos As Integer

'look for the last / in the URL
pos = InStrRev(URL, "/")
'InStrRev has a different syntax than the instr function
'Instr function starts searching from the left side or the
'start of the string but instrrev function starts searching
'from the end of the string

'if it is less than 8,it is part of the http:// protocol specifier
'Therefore, we can just tack (tack: stick in) a / at the end and return it.

If pos < 8 Then
   MakeBaseURL = URL & "/"
   Exit Function
   
   'Otherwise, strip off everything after the last /, thus removing
   'the filename.
   
Else
   MakeBaseURL = Left(URL, pos)
End If

End Function

Comments

  1. 13 Apr 2007 at 10:47

    4 Buttons - Hmm  for the newbie that is lost here is something you might wish to know.

     

    0,1,2,3,4 = 5 Buttons to me...

  2. 28 Oct 2004 at 18:48

    how do you make a robot anyway, im new so im lost.

  3. 21 Jan 2003 at 02:44

    I was attempting to run the program and ran into a few problems I was hoping someone could help me out with.
    First problem is the CnLinks.Open
    Code:
    strConnect = "Provider=Microsoft.Jet.OLEDB.3.51;"
    strConnect = strConnect & "Persist Security Info=False;"
    strConnect = strConnect & "Data Source = " & gDataFileName


    Set cnLinks = New ADODB.Connection
    Set rsURL = New ADODB.Recordset


    cnLinks.ConnectionString = strConnect
    cnLinks.ConnectionTimeout = 10
    cnLinks.CursorLocation = adUseNone
    cnLinks.Open


    I'm also having a problem with the Inet1.RequestTimeout = 30
    and all the other Inet1 calls.. It gives me the error of variable not defined.



    There are a few things that do not work with this program.. Please can someone help me out here..
    I would greatly appreciate any help at all
       - Ben

  4. 20 Dec 2002 at 00:58
    Thanks for using this code and posting comments. I have rechecked the code, all the files are available. The form you mentioned is also available, the file name for that form is "frmSearch.frm" but i have named it as "frmNewSearch" (in code).

    Thanks,

    S.S
  5. 14 Dec 2002 at 17:45

    you havn't submitted all of the resources (e.g. the code links to a form called frmNewSearch which doesn't exist

  6. 21 Oct 2002 at 09:59

    Still debugging this myself. Instructs are a bit on the pants side!


    To cure the CD snag you need to add Microsoft Common Dialog from Project->Components menu option.


    This will add an icon to left hand toolbar. Drag this onto form, highlight it and change the Name in properties to CD.


    Still mucking about, seems to be a report form missing. Might post further if I can be bothered to get it working.


    DZ

  7. 13 Sep 2002 at 05:20

    About the Web robot...


    I can't get it to work. Plenty of objects are missing when I run it. What am I doing wrong? I Copy/pased the code, added the buttons as suggested, but nothing...


    Object missing when I push New Search => frmNewSearch.Show1 in New Search Sub.


    Object missing when I push Open Search => With CD
      .Filter = "Search databases | *.mdb"


    I'm new at VB-programming so there might be an obvious mistake I just don't know about...
    Please help



  8. 01 Jan 1999 at 00:00

    This thread is for discussions of Create your own Web Robot.

Leave a comment

Sign in or Join us (it's free).

S.S. Ahmed S.S. Ahmed is a senior IT Professional and works for a web and software development firm. Ahmed is a Microsoft Office SharePoint Server MVP. Ahmed specializes in creating database driven dynamic we...

Related discussion

Related podcasts

  • Christian Beauclair

    14 mai 2008 (�mission #0074) ::.Christian Beauclair: Stratégies de migration VB6 vers .NET Nous discutons avec Christian Beauclair des stratégies de migration VB6 vers .NET. Entre autres, nous discutons comment utiliser le "VB 6 Code Advisor" et le "Interop Forms Toolkit" pour ajouter la puiss...

Want to stay in touch with what's going on? Follow us on twitter!