OCR.NET for Android 1.0
OCR.NET for Android 1.0 | 16 Mb
Optical character recognition library for Android. Use OCR.NET for Android to retrieve text from images.
uses Tesseract OCR engine and Leptonica image processing library
supports Android armeabi-v7a devices
source code included in full version
royalty free distribution in applications
Constructors
public Ocr(Activity activity);
Initializes a new instance of the Ocr class.
Properties
public bool Active {get; set;}
Set Active property to true to initialize Tesseract library.
public string[] AvailableLanguages {get;}
Returns the available languages.
public int Confidence {get;}
Average word confidence for Tesseract page result.
public Rectangle[] ConnectedComponents {get;}
Returns the location of the selected individual connected (text) components.
public string DataPath {get; set;}
Path to tessdata folder containing Tesseract language data.
public float DeskewAngle {get;}
Deskew angle.
public EngineMode EngineMode {get; set;}
public enum EngineMode
{
TesseractOnly, // use Tesseract engine only (the fastest)
CubeOnly, // use Cube engine only (better accuracy)
Combined, // use both engines combined (the best accuracy)
Default // use the default engine, which is the Tesseract engine only
}
Chooses OCR engine.
public IntPtr Handle {get;}
Handle of Tesseract engine.
public static string ImagelibVersions {get;}
Returns versions of image libraries used by Leptonica.
public string InitLanguages {get;}
Returns the languages string used in the last valid initialization. Languages loaded as dependencies of other loaded languages aren't included in this property.
public Language Language {get; set;}
public enum Language
{
Default, Custom,
Arabic, Bangla, Bulgarian, Catalan, Cherokee, ChineseSimplified, ChineseTraditional, Czech, Danish, Dutch, English, Finnish, French, German, Greek, Hebrew, Hindi, Hungarian, Indonesian, Italian, Japanese, Korean, Latvian, Lithuanian, Norwegian, Polish, Portuguese, Quechua, Romanian, Russian, Serbian, Slovak, Slovenian, Spanish, Swedish, Tagalog, Thai, Turkish, Ukrainian, Vietnamese
}
Language used by Tesseract engine. Language data are available at http://code.google.com/p/tesseract-ocr/source/checkout.
public string LanguageCode {get; set;}
Custom language code or multiple language codes used by Tesseract engine. To specify multiple languages, separate language codes by '+' character: eng+slk+rus.
public string[] LoadedLanguages {get;}
Returns the loaded languages. Includes all languages loaded by the last Tesseract initialization, including those loaded as dependencies of other loaded languages.
public int PageCount {get;}
Returns number of pages in multipage TIFF image.
public int PageNumber {get; set;}
Page number in a multipage TIFF image that will be used for OCR.
public PageOrientation PageOrientation {get;}
public enum PageOrientation { Up, Right, Down, Left }
Page orientation.
public PageSegmentation PageSegmentation {get; set;}
public enum PageSegmentation
{
OSDOnly, // orientation and script detection only
AutoOSD, // automatic page segmentation with orientation and script detection (OSD)
AutoOnly, // automatic page segmentation, but no OSD or OCR
Auto, // automatic page segmentation, but no OSD
SingleColumn, // assume a single column of text of variable sizes
SingleVerticalBlock, // assume a single uniform block of vertically aligned text
SingleBlock, // assume a single uniform block of text
SingleLine, // treat the image as a single text line
SingleWord, // treat the image as a single word
CircleWord, // treat the image as a single word in a circle
SingleChar, // treat the image as a single character
SparseText, // find as much text as possible in no particular order
SparseTextOSD // sparse text with orientation and script detection (OSD)
}
Selects page layout analysis mode.
public Paragraph[] Paragraphs {get;}
public class Paragraph
{
public Rectangle Location {get;} // left, top, right and bottom coordinates
public string Text {get;} // recognized text
public float Confidence {get;} // mean confidence interpreted as a percent probability (0.0 - 100.0)
public int FirstLineIndent {get;} // indent of the first line
public float DeskewAngle {get;} // deskew angle
public ParagraphJustification Justification {get;} // text justification
public PageOrientation Orientation {get;} // orientation
public WritingDirection WritingDirection {get;} // writing direction
public TextLineOrder TextLineOrder {get;} // line order
public bool IsListItem {get;} // list item
public bool IsCrown {get;} // crown
public bool IsLeftToRight {get;} // LTR direction
}
public enum ParagraphJustification { Unknown, Left, Center, Right }
Provides detailed information of the selected paragraph.
public Image Picture {get; set;}
Picture used for OCR.
public string PictureFileName {get; set;}
Picture file used for OCR. The picture file is not loaded by component, but directly used by image libraries. GIF, JPEG, PNG and TIFF image formats are supported.
public int PictureHeight {get; set;}
Height of the part of picture used for OCR.
public int PictureLeft {get; set;}
Left origin of the part of picture used for OCR.
public int PictureTop {get; set;}
Top origin of the part of picture used for OCR.
public int PictureWidth {get; set;}
Width of the part of picture used for OCR.
public Rectangle[] Regions {get;}
Returns the location of regions (result of page layout analysis).
public Region[] RegionDetails {get;}
public class Region
{
public Rectangle Location {get;} // left, top, right and bottom coordinates
public string Text {get;} // recognized text
public float Confidence {get;} // mean confidence interpreted as a percent probability (0.0 - 100.0)
public float DeskewAngle {get;} // deskew angle
public PageOrientation Orientation {get;} // orientation
public WritingDirection WritingDirection {get;} // writing direction
public TextLineOrder TextLineOrder {get;} // line order
}
Provides detailed information of the selected region.
public int Resolution {get; set;}
Resolution of the source image in pixels per inch, so font size information can be calculated in results.
public Rectangle[] Strips {get;}
Returns the location of selected textline or strip of image regions.
public Symbol[] Symbols {get;}
public class Symbol
{
public Rectangle Location {get;} // left, top, right and bottom coordinates
public string Text {get;} // recognized text
public float Confidence {get;} // mean confidence interpreted as a percent probability (0.0 - 100.0)
}
Provides detailed information of the selected symbol.
public string Text {get;}
Recognized text.
public TextLineOrder TextLineOrder {get;}
public enum TextLineOrder { LeftToRight, RightToLeft, TopToBottom }
Text lines are read in the given sequence.
public Rectangle[] TextLines {get;}
Returns the location of textlines.
public TextLine[] TextLineDetails {get;}
public class TextLine
{
public Rectangle Location {get;} // left, top, right and bottom coordinates
public string Text {get;} // recognized text
public float Confidence {get;} // mean confidence interpreted as a percent probability (0.0 - 100.0)
public float DeskewAngle {get;} // deskew angle
public PageOrientation Orientation {get;} // orientation
public WritingDirection WritingDirection {get;} // writing direction
public TextLineOrder TextLineOrder {get;} // line order
}
Provides detailed information of the selected textline.
public byte[] Unlv {get;}
Recognized text in UTF-8 UNLV format.
public string UnlvText {get;}
Recognized text in UNLV format.
public byte[] Utf8Text {get;}
Recognized text in UTF-8 encoding.
public static string Version {get;}
Returns version of Tesseract OCR engine.
public WordDetail[] WordDetails {get;}
public class WordDetail
{
public Rectangle Location { get; private set; } // left, top, right and bottom coordinates
public string Text { get; private set; } // recognized text
public float Confidence { get; private set; } // mean confidence interpreted as a percent probability (0.0 - 100.0)
public WordFlag Flags { get; private set; } // flags
public string FontName { get; private set; } // font name
public int PointSize { get; private set; } // font size in printer points (1/72 inch)
}
[Flags]
public enum WordFlag
{
Bold, // word is bold
Italic, // word is italic
Underlined, // word is underlined
Monospace, // word uses monospace font
Serif, // word uses serif font
SmallCaps, // word uses small caps
DropCap, // word is dropcap
Subscript, // word is subscript
Superscript, // word is superscript
Numeric, // word is numeric
FromDictionary // word was found in dictionary
}
Provides detailed information of the recognized word. Returns the location, text, confidence and other information of the selected word.
public Rectangle[] Words {get;}
Returns the location of recognized words.
public WritingDirection WritingDirection {get;}
public enum WritingDirection { LeftToRight, RightToLeft, TopToBottom }
Writing direction.
Methods
public bool AdaptToWord(PageSegmentation pageSegmentation, string word);
Applies the given word to the adaptive classifier. See Tesseract documentation for further information.
public void AddInitParameter(string name, string value);
Add variable used for Tesseract initialization.
public void ClearAdaptiveClassifier();
Call this function between pages to free up memory and forget adaptive data.
public void ClearInitParameters();
Clears variables used for Tesseract initialization.
public void ClearPersistentCache();
Clears any library-level memory caches. There are a variety of expensive-to-load constant data structures (mostly language dictionaries) in Tesseract that are cached globally. This procedure allows the clearing of these caches.
public bool GetBoolParameter(string name);
Get Tesseract boolean variable.
public string GetBoxText();
Recognized text with box information. Each line consists of text, left position, top position, right position, bottom position and page number separated by spaces.
public double GetDoubleParameter(string name);
Get Tesseract double variable.
public string GetHtmlText();
Recognized text in HTML format.
public int GetIntParameter(string name);
Get Tesseract integer variable.
public string GetStringParameter(string name);
Get Tesseract string variable.
public bool GetTextDirection(out int offset, out float slope);
Retrieves offset and slope of lines. See Tesseract documentation for further information.
public string GetTsvText();
Recognized text in tab-separated values (TSV) format.
public string GetUniChar(int id);
Returns the string form of the specified unichar.
public bool IsValidCharacter(char character);
Returns true if the specified character is defined in the UniCharset.
public bool IsValidWord(string word);
Check whether a word is valid according to Tesseract's language model.
public bool PrintVariables(string fileName);
Print Tesseract variables to the specified file.
public void ReadConfigFile(string fileName);
Read a config file containing a set of parameter name-value pairs.
public void Recognize();
Initiates OCR recognition.
public void SetBoolParameter(string name, bool value);
Set Tesseract boolean variable.
public void SetDoubleParameter(string name, double value);
Set Tesseract double variable.
public void SetIntParameter(string name, int value);
Set Tesseract integer variable.
public void SetStringParameter(string name, string value);
Set Tesseract string variable.
Events
public event ProgressEventHandler Progress;
Progress notification
public delegate void ProgressEventHandler(object sender, ProgressEventArgs e);
public class ProgressEventArgs : EventArgs
public bool Cancel { get; set; }
Request to cancel recognition process.
public int Progress { get; }
Percent complete. Value range is 0 to 100.
public int WordCount { get; }
Returns number of found words.
Restriction: If you have LIFETIME V.I.P membership, please mail us for the password.
Only for V.I.P
Warning! You are not allowed to view this text.