0% found this document useful (1 vote)
134 views

Source Code

This document contains code for a program that can read text from various file types including PDFs, images, and text files. It defines functions for opening files, extracting text using iTextSharp for PDFs, MODI for images, and Tesseract for OCR. The text is displayed in a text box. Buttons trigger the different file reading methods based on the file extension. It also contains functions for converting images to base64 strings and reading files into memory streams.

Uploaded by

info.glcom5161
Copyright
© All Rights Reserved
Available Formats
Download as TXT or PDF, or read online on Scribd
0% found this document useful (1 vote)
134 views

Source Code

This document contains code for a program that can read text from various file types including PDFs, images, and text files. It defines functions for opening files, extracting text using iTextSharp for PDFs, MODI for images, and Tesseract for OCR. The text is displayed in a text box. Buttons trigger the different file reading methods based on the file extension. It also contains functions for converting images to base64 strings and reading files into memory streams.

Uploaded by

info.glcom5161
Copyright
© All Rights Reserved
Available Formats
Download as TXT or PDF, or read online on Scribd
You are on page 1/ 5

Imports System
Imports System.IO
Imports System.Text
Imports iTextSharp.text
Imports System.Drawing.Imaging
Imports SD = System.Drawing
Imports VB = Microsoft.VisualBasic
Imports System.Windows.Resources
Imports System.Collections.Generic
Imports System.Runtime.Serialization
Imports System.Runtime.Serialization.Json
Imports System.Threading
Imports TS = tessnet2

Imports MODI
Public Class frmReadPDF
' sFileExtn holds the lower-case extension (without the dot) of the file picked in
' btnPickFile_Click. NOTE(review): sFileName is not assigned anywhere in the visible code.
Dim sFileName, sFileExtn As String
' Working folders. Only cxBasePath and cxDesktopPath are assigned in the visible code
' (see frmReadPDF_Load); the others stay Nothing unless set elsewhere.
Public cxBasePath, cxTempPath, cxDesktopPath, cxDesktopLog, cxMyDocPath As String
' NOTE(review): not read in the visible code, and the name shadows System.DateTime
' inside this class. Kept because other parts of the project may reference it.
Private DateTime = &H132 '306
''' <summary>
''' Form load handler: caches the user's desktop folder and the current working
''' directory in cxDesktopPath and cxBasePath.
''' </summary>
''' <param name="sender">The form raising the event.</param>
''' <param name="e">Unused event data.</param>
Private Sub frmReadPDF_Load(ByVal sender As Object, ByVal e As System.EventArgs) Handles Me.Load
    cxDesktopPath = System.Environment.GetFolderPath(Environment.SpecialFolder.Desktop)
    ' cxBasePath is later used as the file picker's initial folder and as the
    ' parent of the "tessdata" folder passed to Tesseract.
    cxBasePath = System.Environment.CurrentDirectory
End Sub
''' <summary>
''' Lets the user pick a PDF or image file (at most 2,000,000 bytes), stores its path
''' and lower-case extension, and enables only the reader buttons that apply to it.
''' </summary>
''' <param name="sender">The button raising the event.</param>
''' <param name="e">Unused event data.</param>
Private Sub btnPickFile_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles btnPickFile.Click
    rtbPdfText.Text = ""
    Try
        ' Using releases the dialog's resources on every exit path.
        Using ofd As New OpenFileDialog
            ofd.Title = "Please Select a PDF/Image file"
            ofd.InitialDirectory = cxBasePath
            ' The pattern part (after "|") must not contain spaces. The list also
            ' covers every extension the Select Case below accepts.
            ofd.Filter = "PDF & Image files (*.pdf, *.bmp, *.jpg, *.jpeg, *.gif, *.tif, *.tiff, *.png)|*.pdf;*.bmp;*.jpg;*.jpeg;*.gif;*.tif;*.tiff;*.png"
            If ofd.ShowDialog <> Windows.Forms.DialogResult.OK Then Return

            Dim fi As FileInfo = New FileInfo(ofd.FileName)
            If fi.Length > 2000000 Then
                '--// Limit file size upto 2MB
                MsgBox("Please choose a file of max size upto 2MB !", MsgBoxStyle.Exclamation + MsgBoxStyle.OkOnly, Me.Text)
                Return
            End If

            txtFileName.Text = ofd.FileName
            ' Invariant lower-casing keeps the comparison below independent of the UI culture.
            lblFileExtn.Text = System.IO.Path.GetExtension(ofd.FileName).ToLowerInvariant()
            sFileExtn = Replace(lblFileExtn.Text, ".", String.Empty)

            Dim isPdf As Boolean = False
            Dim isImage As Boolean = False
            Select Case sFileExtn
                Case "pdf"
                    isPdf = True
                Case "bmp", "tif", "gif", "jpg", "png", "jpeg", "tiff"
                    isImage = True
                Case Else
                    ' Unsupported type (e.g. typed into the dialog): leave every reader disabled
                    ' so no button keeps the state of a previously picked file.
            End Select

            btnReadPdf.Enabled = isPdf
            btnReadFSM.Enabled = isPdf
            btnReadMDI.Enabled = isImage
            btnReadImg.Enabled = isImage
            btnReadTss.Enabled = isImage
        End Using
    Catch ex As Exception
        MsgBox(ex.Message)
    End Try
End Sub
''' <summary>
''' Runs MODI (Microsoft Office Document Imaging) English OCR on the selected file
''' and shows the recognized words, each preceded by a space, in rtbPdfText.
''' </summary>
''' <param name="sender">The button raising the event.</param>
''' <param name="e">Unused event data.</param>
Private Sub btnReadMDI_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles btnReadMDI.Click
    Dim md As New MODI.Document()
    Try
        md.Create(txtFileName.Text)
        md.OCR(MODI.MiLANGUAGES.miLANG_ENGLISH, True, True)

        Dim recognized As New StringBuilder()
        ' Visit every image in the document, not just the first one, so
        ' multi-page files (e.g. TIFF) are read completely.
        For p As Integer = 0 To md.Images.Count - 1
            Dim page As MODI.Image = DirectCast(md.Images(p), MODI.Image)
            Dim layout As MODI.Layout = page.Layout
            For j As Integer = 0 To layout.Words.Count - 1
                Dim word As MODI.Word = DirectCast(layout.Words(j), MODI.Word)
                recognized.Append(" ").Append(word.Text)
            Next
        Next
        rtbPdfText.Text = recognized.ToString()
    Catch ex As Exception
        MsgBox(ex.Message)
    Finally
        Try
            ' Close without saving so the source file is released.
            md.Close(False)
        Catch
            ' Best effort: Close can fail when Create never succeeded; the
            ' original error (if any) has already been reported above.
        End Try
        md = Nothing
    End Try
End Sub
''' <summary>
''' Extracts the text of the selected PDF with GetTextFromPDF and shows it in
''' rtbPdfText; any error is reported in a message box.
''' </summary>
''' <param name="sender">The button raising the event.</param>
''' <param name="e">Unused event data.</param>
Private Sub btnReadPdf_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles btnReadPdf.Click
    Try
        rtbPdfText.Text = GetTextFromPDF(txtFileName.Text)
    Catch ex As Exception
        MsgBox(ex.Message)
    End Try
End Sub
''' <summary>
''' Returns the text of every page of a PDF, concatenated in page order with no
''' separator, using iTextSharp's SimpleTextExtractionStrategy.
''' </summary>
''' <param name="PdfFileName">Path of the PDF file to read.</param>
''' <returns>The extracted text; an empty string when the PDF has no pages.</returns>
''' <remarks>Exceptions raised by iTextSharp propagate to the caller.</remarks>
Public Function GetTextFromPDF(ByVal PdfFileName As String) As String
    Dim oReader As New iTextSharp.text.pdf.PdfReader(PdfFileName)
    Try
        Dim sOut As New StringBuilder()
        For pageNo As Integer = 1 To oReader.NumberOfPages
            ' A fresh strategy per page, as in the original code.
            Dim its As New iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy
            sOut.Append(iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(oReader, pageNo, its))
        Next
        Return sOut.ToString()
    Finally
        ' Release the file handle even when extraction fails.
        oReader.Close()
    End Try
End Function
''' <summary>
''' Loads the selected image and shows its JPEG-encoded bytes as a Base64 string
''' in rtbPdfText. Does nothing when the stored extension is not a known image type.
''' </summary>
''' <param name="sender">The button raising the event.</param>
''' <param name="e">Unused event data.</param>
Private Sub btnReadImg_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles btnReadImg.Click
    Dim validExtns() As String = {"bmp", "tif", "gif", "jpg", "png", "jpeg", "tiff"}
    If Array.IndexOf(validExtns, sFileExtn) < 0 Then Return

    Try
        ' Using releases the GDI+ handle (and the file lock) as soon as the
        ' conversion is done.
        Using img As New Bitmap(txtFileName.Text)
            rtbPdfText.Text = ConvertImageToBase64String(img)
        End Using
    Catch ex As Exception
        ' Same reporting style as the other reader buttons.
        MsgBox(ex.Message)
    End Try
End Sub
''' <summary>
''' Runs tessnet2 OCR over an image and returns the recognized text, one line per
''' recognized line, each terminated by CR/LF.
''' </summary>
''' <param name="bm">Image to recognize.</param>
''' <param name="language">Language code passed to Tesseract.Init.</param>
''' <param name="path">Data path passed to Tesseract.Init.</param>
''' <returns>
''' The recognized text. When an exception occurs it is shown in a message box and
''' whatever text was collected so far (possibly empty) is returned.
''' </returns>
Public Shared Function OCRImage(ByVal bm As System.Drawing.Image, ByVal language As String, ByVal path As String) As String
    Dim ocrText As New StringBuilder()
    Dim oOCR As New tessnet2.Tesseract
    Try
        oOCR.Init(path, language, False)
        ' An empty rectangle is passed as the region argument, as in the original code.
        Dim WordList As List(Of tessnet2.Word) = oOCR.doOCR(bm, SD.Rectangle.Empty)
        Dim LineCount As Integer = tessnet2.Tesseract.LineCount(WordList)
        For i As Integer = 0 To LineCount - 1
            ocrText.Append(tessnet2.Tesseract.GetLineText(WordList, i)).Append(vbCrLf)
        Next
    Catch ex As Exception
        MsgBox(ex.Message)
    Finally
        oOCR.Dispose()
    End Try
    Return ocrText.ToString()
End Function
''' <summary>
''' Runs Tesseract OCR on the selected image via ReadImageTextUsingTESS and shows
''' the result in rtbPdfText.
''' </summary>
''' <param name="sender">The button raising the event.</param>
''' <param name="e">Unused event data.</param>
Private Sub btnReadTss_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles btnReadTss.Click
    ' The abandoned screen-capture experiment that used to be commented out here
    ' was removed; its wrapped continuation line had become live, invalid code.
    rtbPdfText.Text = ReadImageTextUsingTESS()
End Sub
''' <summary>
''' Runs tessnet2 OCR ("eng") on the file named in txtFileName, using the
''' "tessdata" folder under cxBasePath, and returns the recognized lines, each
''' terminated by CR/LF.
''' </summary>
''' <returns>
''' The recognized text. When an exception occurs it is shown in a message box and
''' whatever text was collected so far (possibly empty) is returned.
''' </returns>
Public Function ReadImageTextUsingTESS() As String
    Dim tsDataPath As String = cxBasePath & "\tessdata"
    Dim ocrText As New StringBuilder()
    Dim ocr As New TS.Tesseract
    Try
        ' Using releases the bitmap (and its file lock) once OCR is finished.
        Using img As New Bitmap(txtFileName.Text)
            ocr.Init(tsDataPath, "eng", False) 'To use correct tessdata
            ' An empty rectangle is passed as the region argument, as in the original code.
            Dim result As List(Of TS.Word) = ocr.doOCR(img, SD.Rectangle.Empty)
            Dim iLines As Integer = TS.Tesseract.LineCount(result)
            For i As Integer = 0 To iLines - 1
                ocrText.Append(TS.Tesseract.GetLineText(result, i)).Append(vbCrLf)
            Next
        End Using
    Catch ex As Exception
        MsgBox(ex.Message)
    Finally
        ocr.Dispose()
    End Try
    Return ocrText.ToString()
End Function
''' <summary>
''' Encodes an image as JPEG and returns the encoded bytes as a Base64 string.
''' </summary>
''' <param name="value">Image to encode; may be Nothing.</param>
''' <returns>The Base64 text, or an empty string when value is Nothing.</returns>
Public Function ConvertImageToBase64String(ByVal value As System.Drawing.Image) As String
    If value Is Nothing Then Return ""
    Using ms As New MemoryStream
        value.Save(ms, ImageFormat.Jpeg)
        ' ToArray copies the whole buffer regardless of Position, so no
        ' Flush or rewind is needed first.
        Return Convert.ToBase64String(ms.ToArray())
    End Using
End Function
''' <summary>
''' Reads the selected file's raw bytes and shows them, decoded as ASCII, in
''' rtbPdfText; any error is reported in a message box.
''' </summary>
''' <param name="sender">The button raising the event.</param>
''' <param name="e">Unused event data.</param>
Private Sub btnReadFSM_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles btnReadFSM.Click
    Try
        ' ReadAllBytes returns exactly the file's bytes. The previous
        ' ReDim bteRead(length) allocated length + 1 elements (VB arrays are sized
        ' by upper bound), which appended a NUL character to the text, and a
        ' single Stream.Read call is not guaranteed to fill the buffer.
        Dim bteRead() As Byte = File.ReadAllBytes(txtFileName.Text)
        rtbPdfText.Text = Encoding.ASCII.GetString(bteRead)
    Catch ex As Exception
        MsgBox(ex.Message)
    End Try
End Sub
''' <summary>
''' Closes the form.
''' </summary>
''' <param name="sender">The button raising the event.</param>
''' <param name="e">Unused event data.</param>
Private Sub btnClose_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles btnClose.Click
    ' No forced GC.Collect() here: a blocking full collection on the UI thread
    ' is not needed just to close the form.
    Me.Close()
End Sub
End Class

You might also like