GetFileInfo Method
This method retrieves the data for a single page. If the file is static, its
content is read from disk by the GetStaticFileContent
method. If the file is dynamic, its content is retrieved from the server by
the GetDynamicFileContent
method. The title is extracted
from the title tags, and the description and keywords from the meta tags, by calling
the GetMetaContent
method. Finally, the HTML markup is stripped from
the page content by calling the Searchs.CleanHtml.Clean
method.
'**********************************************
'
' GetFileInfo Method
'
' Fills srchFile with the size, URL path, title,
' description, keywords and searchable text of
' the file at FPath.
'
' FPath    - physical path of the file to read
' srchFile - page object that receives the data
'
'**********************************************
Public Shared Sub GetFileInfo(ByVal FPath As String, _
        ByVal srchFile As Searchs.Page)
    Dim fileInform As New FileInfo(FPath)
    Dim strBldFile As New StringBuilder()

    'Size is stored in whole kilobytes (integer division)
    Dim fileSize As Decimal = fileInform.Length \ 1024
    srchFile.Size = fileSize

    GetFilePath(FPath, srchFile)

    'Dynamic file types are requested from the server; everything else is read from disk
    If InStr(1, Searchs.Site.DynamicFilesTypesToSearch, fileInform.Extension, vbTextCompare) > 0 Then
        m_page.Trace.Warn("Path", String.Format("{0}/{1}", "", srchFile.Path))
        GetDynamicFileContent(srchFile)
    Else
        GetStaticFileContent(FPath, srchFile)
    End If

    'Nothing was read, so there is nothing to parse.
    'The Nothing check avoids a NullReferenceException when Contents was never set.
    If srchFile.Contents Is Nothing OrElse srchFile.Contents.Length = 0 Then
        Exit Sub
    End If

    'NOTE: the content has already been loaded by the helper above. The former
    'second read from an undeclared reader ("sr") has been removed.

    'Read in the title of the file
    srchFile.Title = GetMetaContent(srchFile.Contents, _
        "<title>", "</title>")

    'Read in the description meta tag of the file.
    'The closing delimiter is "> (the previous value ,"> required a comma
    'immediately before the closing quote and so practically never matched).
    srchFile.Description = GetMetaContent(srchFile.Contents, _
        "<meta name=""description"" content=""", """>")

    'Read in the keywords of the file
    srchFile.Keywords = GetMetaContent(srchFile.Contents, _
        "<meta name=""keywords"" content=""", """>")

    'Strip the HTML markup, then append the meta data so that it is
    'part of the searchable text as well
    srchFile.Contents = _
        Searchs.CleanHtml.Clean(srchFile.Contents)
    srchFile.Contents = _
        strBldFile.AppendFormat("{0} {1} {2} {3}", _
        srchFile.Contents, srchFile.Description, _
        srchFile.Keywords, srchFile.Title).ToString.Trim()
End Sub
'*********************************************************************
'
' GetStaticFileContent Method
'
' Reads the whole file at FPath into srchFile.Contents, using the
' encoding configured in Searchs.Site.Encoding.
'
' If reading fails, the error is traced and the exception message is
' stored in srchFile.Contents instead of the file text.
'
'*********************************************************************
Private Shared Sub GetStaticFileContent(ByVal FPath As String, ByVal srchFile As Searchs.Page)
    Dim sr As StreamReader

    'File.OpenText always reads as UTF-8; any other encoding is looked up by name
    If Searchs.Site.Encoding.Equals("utf-8") Then
        sr = File.OpenText(FPath)
    Else
        sr = New StreamReader(FPath, Encoding.GetEncoding(Searchs.Site.Encoding))
    End If

    Try
        srchFile.Contents = sr.ReadToEnd()
    Catch ex As Exception
        m_page.Trace.Warn("Error", ex.Message)
        srchFile.Contents = ex.Message
    Finally
        'Release the file handle even when the read fails
        sr.Close()
    End Try
End Sub
GetDynamicFileContent
GetDynamicFileContent
branches into one of two methods, GetDynamicFileContentOther
or GetDynamicFileContentUTF,
depending on the encoding.
'*********************************************************************
'
' GetDynamicFileContent Method
'
' Dispatches to the UTF-8 or the generic download routine depending on
' the encoding configured in Searchs.Site.Encoding.
'
'*********************************************************************
Private Shared Sub GetDynamicFileContent(ByVal srchFile As Searchs.Page)
    If Searchs.Site.Encoding.Equals("utf-8") Then
        GetDynamicFileContentUTF(srchFile)
    Else
        GetDynamicFileContentOther(srchFile)
    End If
End Sub
System.Net.WebClient
provides common methods for sending data
to and receiving data from a resource identified by a URI. We make use of the DownloadData
method, which downloads data from a resource and returns a byte array.
Applications that target the common language runtime use encoding to map character representations from the native character scheme (Unicode) to other schemes. Applications use decoding to map characters from nonnative schemes (non-Unicode) to the native scheme. The System.Text Namespace provides classes that allow you to encode and decode characters.
'*********************************************************************
'
' GetDynamicFileContentOther Method
'
' Downloads the page at Searchs.Site.ApplicationPath/srchFile.Path and
' decodes it with the encoding named in Searchs.Site.Encoding.
'
' If anything fails, the error is traced and the exception message is
' stored in srchFile.Contents instead of the page text.
'
'*********************************************************************
Private Shared Sub GetDynamicFileContentOther(ByVal srchFile As Searchs.Page)
    Dim wcMicrosoft As System.Net.WebClient = Nothing
    Dim fileEncoding As System.Text.Encoding
    Try
        fileEncoding = System.Text.Encoding.GetEncoding(Searchs.Site.Encoding)

        'The client must be created before use; it was previously left as
        'Nothing, so every download failed with a NullReferenceException
        wcMicrosoft = New System.Net.WebClient()

        srchFile.Contents = fileEncoding.GetString( _
            wcMicrosoft.DownloadData(String.Format("{0}/{1}", Searchs.Site.ApplicationPath, srchFile.Path)))
    Catch ex As System.Exception
        'Covers System.Net.WebException as well; both were handled identically
        m_page.Trace.Warn("Error", ex.Message)
        srchFile.Contents = ex.Message
    Finally
        'Release the client's resources whether or not the download succeeded
        If Not wcMicrosoft Is Nothing Then
            wcMicrosoft.Dispose()
        End If
    End Try
End Sub
UTF8Encoding
class encodes Unicode characters using UCS Transformation Format, 8-bit form (UTF-8). This encoding supports all Unicode character values and surrogates.
'*********************************************************************
'
' GetDynamicFileContentUTF Method
'
' Downloads the page at Searchs.Site.ApplicationPath/srchFile.Path and
' decodes it as UTF-8.
'
' If anything fails, the error is traced and the exception message is
' stored in srchFile.Contents instead of the page text.
'
'*********************************************************************
Private Shared Sub GetDynamicFileContentUTF(ByVal srchFile As Searchs.Page)
    Dim wcMicrosoft As System.Net.WebClient = Nothing
    Dim objUTF8Encoding As UTF8Encoding
    Try
        wcMicrosoft = New System.Net.WebClient()
        objUTF8Encoding = New UTF8Encoding()
        srchFile.Contents = objUTF8Encoding.GetString( _
            wcMicrosoft.DownloadData(String.Format("{0}/{1}", Searchs.Site.ApplicationPath, srchFile.Path)))
    Catch ex As System.Exception
        'Covers System.Net.WebException as well; both were handled identically
        m_page.Trace.Warn("Error", ex.Message)
        srchFile.Contents = ex.Message
    Finally
        'Release the client's resources whether or not the download succeeded
        If Not wcMicrosoft Is Nothing Then
            wcMicrosoft.Dispose()
        End If
    End Try
End Sub
GetFilePath Method
The GetFilePath
method converts the local folder path into the corresponding URL path on the site.
'*****************************************
'
' GetFilePath Method
'
' Converts the physical path of a file into
' a site-relative URL path and stores it in
' srchFile.Path so that it can be displayed
' as a hyperlink.
'
'*****************************************
Private Shared Sub GetFilePath(ByVal strFileURL As String, _
        ByVal srchFile As Searchs.Page)
    'Turn the server path to the file into a URL path to the file
    strFileURL = Replace(strFileURL, m_page.Server.MapPath("./"), "")

    'Replace the NT backslash with the internet
    'forward slash in the URL to the file
    strFileURL = Replace(strFileURL, "\", "/")

    'Encode the file name and path into the URL code method
    strFileURL = m_page.Server.UrlEncode(strFileURL)

    'UrlEncode also escapes the forward slashes; turn them back into "/".
    'Compare must be passed by name: as the fourth positional argument the
    'value lands in the Start parameter and the comparison stays binary,
    'so an upper-case "%2F" would not be replaced.
    strFileURL = Replace(strFileURL.Trim(), _
        "%2f", "/", Compare:=CompareMethod.Text)

    srchFile.Path = strFileURL
    m_page.Trace.Warn("Url", srchFile.Path)
End Sub
GetMetaContent Method
GetMetaContent
method uses regular expressions to strip the tags and get the required information.
'************************************************
'
' GetMetaContent Method
'
' Returns the text found between strMetaStart and
' strMetaEnd in strFile (case-insensitive), or an
' empty string when the pair is not found.
'
' strFile      - text to search
' strMetaStart - opening delimiter, e.g. "<title>"
' strMetaEnd   - closing delimiter, e.g. "</title>"
'
' Both delimiters are inserted into the regular
' expression as given (they are not escaped).
'
'************************************************
Private Shared Function GetMetaContent(ByVal strFile As String, _
        ByVal strMetaStart As String, ByVal strMetaEnd As String) As String
    'Nothing to search in
    If strFile Is Nothing Then
        Return String.Empty
    End If

    'If no description or keywords are found then you may be
    'using http-equiv= instead of name= in your meta tags
    If InStr(1, strFile, strMetaStart, CompareMethod.Text) = 0 _
            AndAlso InStr(strMetaStart, "name=") > 0 Then
        'Swap name= for http-equiv=
        strMetaStart = Replace(strMetaStart, "name=", "http-equiv=")
    End If

    'Build the pattern; (.|\n) lets the lazily captured text span several lines
    Dim strPattern As String = String.Format("{0}{1}{2}", _
        strMetaStart, "((.|\n)*?)", strMetaEnd)

    'Group 1 is the text between the delimiters. Reading it from the match
    'replaces the former second pass, whose "(.*?)" pattern could not cross
    'line breaks and therefore returned multi-line values with their tags.
    'A failed match yields an empty group, i.e. an empty string.
    Dim metaMatch As Match = Regex.Match(strFile, strPattern, RegexOptions.IgnoreCase)
    Return metaMatch.Groups(1).Value
End Function
Comments