How to Build a Multilingual OCR AI Agent in Python with EasyOCR and OpenCV

In this tutorial, we build an advanced OCR AI agent in Google Colab using EasyOCR, OpenCV, and Pillow, running fully offline with GPU acceleration. The agent includes a preprocessing pipeline with contrast enhancement (CLAHE), denoising, sharpening, and adaptive thresholding to improve recognition accuracy. Beyond basic OCR, we filter results by confidence, generate text statistics, and perform pattern detection (emails, URLs, dates, phone numbers) along with simple language hints. The design also supports batch processing, visualization with bounding boxes, and structured exports for flexible usage. Check out the FULL CODES here.
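Before diving into the full agent, here is a minimal sketch of the core EasyOCR call everything else builds on. It assumes the dependencies installed below, and the filename sample.jpg is just a placeholder for any image you have on hand.

# Minimal EasyOCR sketch: read text from a single image.
# "sample.jpg" is a placeholder filename; replace it with your own image.
import easyocr

reader = easyocr.Reader(['en'], gpu=True)  # downloads the English model on first run
for bbox, text, confidence in reader.readtext('sample.jpg'):
    print(f"{confidence:.2f}  {text}")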
!pip install easyocr opencv-python pillow matplotlib
import easyocr
import cv2
import numpy as np
from PIL import Image, ImageEnhance, ImageFilter
import matplotlib.pyplot as plt
import os
import json
from typing import List, Dict, Tuple, Optional
import re
from google.colab import files
import io
We begin by installing the required libraries, EasyOCR, OpenCV, Pillow, and Matplotlib, to set up our environment. We then import all necessary modules so we can handle image preprocessing, OCR, visualization, and file operations seamlessly. Check out the FULL CODES here.
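If you are unsure whether your Colab runtime actually has a GPU attached, a quick check like the sketch below helps decide what to pass for the gpu flag. This is an optional sanity check, not part of the original pipeline; it relies on PyTorch, which EasyOCR already depends on.

# Optional sanity check: EasyOCR runs on PyTorch, so torch reports GPU availability.
import torch

use_gpu = torch.cuda.is_available()
print(f"GPU available: {use_gpu}")  # pass this flag to easyocr.Reader(..., gpu=use_gpu)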
class AdvancedOCRAgent:
    """
    Advanced OCR AI Agent with preprocessing, multi-language support,
    and intelligent text extraction capabilities.
    """
    def __init__(self, languages: List[str] = ['en'], gpu: bool = True):
        """Initialize OCR agent with specified languages."""
        print("Initializing Advanced OCR Agent...")
        self.languages = languages
        self.reader = easyocr.Reader(languages, gpu=gpu)
        self.confidence_threshold = 0.5
        print(f"OCR Agent ready! Languages: {languages}")

    def upload_image(self) -> Optional[str]:
        """Upload an image file through the Colab interface."""
        print("Upload your image file:")
        uploaded = files.upload()
        if uploaded:
            filename = list(uploaded.keys())[0]
            print(f"Uploaded: {filename}")
            return filename
        return None

    def preprocess_image(self, image: np.ndarray, enhance: bool = True) -> np.ndarray:
        """Advanced image preprocessing for better OCR accuracy."""
        # Convert to grayscale if the image has color channels
        if len(image.shape) == 3:
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        else:
            gray = image.copy()
        if enhance:
            # CLAHE boosts local contrast without over-amplifying noise
            clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
            gray = clahe.apply(gray)
            # Non-local means denoising, followed by a sharpening kernel
            gray = cv2.fastNlMeansDenoising(gray)
            kernel = np.array([[-1, -1, -1], [-1, 9, -1], [-1, -1, -1]])
            gray = cv2.filter2D(gray, -1, kernel)
        # Adaptive thresholding copes with uneven lighting across the page
        binary = cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
        )
        return binary

    def extract_text(self, image_path: str, preprocess: bool = True) -> Dict:
        """Extract text from an image with advanced processing."""
        print(f"Processing image: {image_path}")
        image = cv2.imread(image_path)
        if image is None:
            raise ValueError(f"Could not load image: {image_path}")
        if preprocess:
            processed_image = self.preprocess_image(image)
        else:
            processed_image = image
        results = self.reader.readtext(processed_image)
        extracted_data = {
            'raw_results': results,
            'filtered_results': [],
            'full_text': '',
            'confidence_stats': {},
            'word_count': 0,
            'line_count': 0
        }
        high_confidence_text = []
        confidences = []
        # Keep only detections above the confidence threshold
        for (bbox, text, confidence) in results:
            if confidence >= self.confidence_threshold:
                extracted_data['filtered_results'].append({
                    'text': text,
                    'confidence': confidence,
                    'bbox': bbox
                })
                high_confidence_text.append(text)
                confidences.append(confidence)
        extracted_data['full_text'] = ' '.join(high_confidence_text)
        extracted_data['word_count'] = len(extracted_data['full_text'].split())
        extracted_data['line_count'] = len(high_confidence_text)
        if confidences:
            extracted_data['confidence_stats'] = {
                'mean': np.mean(confidences),
                'min': np.min(confidences),
                'max': np.max(confidences),
                'std': np.std(confidences)
            }
        return extracted_data

    def visualize_results(self, image_path: str, results: Dict, show_bbox: bool = True):
        """Visualize OCR results with bounding boxes."""
        image = cv2.imread(image_path)
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        plt.figure(figsize=(15, 10))
        if show_bbox:
            plt.subplot(2, 2, 1)
            img_with_boxes = image_rgb.copy()
            for item in results['filtered_results']:
                bbox = np.array(item['bbox']).astype(int)
                cv2.polylines(img_with_boxes, [bbox], True, (255, 0, 0), 2)
                x, y = bbox[0]
                cv2.putText(img_with_boxes, f"{item['confidence']:.2f}",
                            (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1)
            plt.imshow(img_with_boxes)
            plt.title("OCR Results with Bounding Boxes")
            plt.axis('off')
        plt.subplot(2, 2, 2)
        processed = self.preprocess_image(image)
        plt.imshow(processed, cmap='gray')
        plt.title("Preprocessed Image")
        plt.axis('off')
        plt.subplot(2, 2, 3)
        confidences = [item['confidence'] for item in results['filtered_results']]
        if confidences:
            plt.hist(confidences, bins=20, alpha=0.7, color='blue')
            plt.xlabel('Confidence Score')
            plt.ylabel('Frequency')
            plt.title('Confidence Score Distribution')
            plt.axvline(self.confidence_threshold, color='red', linestyle='--',
                        label=f'Threshold: {self.confidence_threshold}')
            plt.legend()
        plt.subplot(2, 2, 4)
        stats = results['confidence_stats']
        if stats:
            labels = ['Mean', 'Min', 'Max']
            values = [stats['mean'], stats['min'], stats['max']]
            plt.bar(labels, values, color=['green', 'red', 'blue'])
            plt.ylabel('Confidence Score')
            plt.title('Confidence Statistics')
            plt.ylim(0, 1)
        plt.tight_layout()
        plt.show()

    def smart_text_analysis(self, text: str) -> Dict:
        """Perform intelligent analysis of extracted text."""
        analysis = {
            'language_detection': 'unknown',
            'text_type': 'unknown',
            'key_info': {},
            'patterns': {}
        }
        # Regex patterns for common structured data
        email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
        phone_pattern = r'(\+\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'
        url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
        date_pattern = r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b'
        patterns = {
            'emails': re.findall(email_pattern, text, re.IGNORECASE),
            'phones': re.findall(phone_pattern, text),
            'urls': re.findall(url_pattern, text, re.IGNORECASE),
            'dates': re.findall(date_pattern, text)
        }
        analysis['patterns'] = {k: v for k, v in patterns.items() if v}
        if any(patterns.values()):
            if patterns.get('emails') or patterns.get('phones'):
                analysis['text_type'] = 'contact_info'
            elif patterns.get('urls'):
                analysis['text_type'] = 'web_content'
            elif patterns.get('dates'):
                analysis['text_type'] = 'document_with_dates'
        # Lightweight script-based language hints
        if re.search(r'[а-яё]', text.lower()):
            analysis['language_detection'] = 'russian'
        elif re.search(r'[àáâãäåæçèéêëìíîïñòóôõöøùúûüý]', text.lower()):
            analysis['language_detection'] = 'romance_language'
        elif re.search(r'[一-龯]', text):
            analysis['language_detection'] = 'chinese'
        elif re.search(r'[ひらがなカタカナ]', text):
            analysis['language_detection'] = 'japanese'
        elif re.search(r'[a-zA-Z]', text):
            analysis['language_detection'] = 'latin_based'
        return analysis

    def process_batch(self, image_folder: str) -> List[Dict]:
        """Process multiple images in batch."""
        results = []
        supported_formats = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff')
        for filename in os.listdir(image_folder):
            if filename.lower().endswith(supported_formats):
                image_path = os.path.join(image_folder, filename)
                try:
                    result = self.extract_text(image_path)
                    result['filename'] = filename
                    results.append(result)
                    print(f"Processed: {filename}")
                except Exception as e:
                    print(f"Error processing {filename}: {str(e)}")
        return results

    def export_results(self, results: Dict, format: str = 'json') -> str:
        """Export results in the specified format."""
        if format.lower() == 'json':
            # default=str guards against non-serializable numpy types in raw_results
            output = json.dumps(results, indent=2, ensure_ascii=False, default=str)
            filename = 'ocr_results.json'
        elif format.lower() == 'txt':
            output = results['full_text']
            filename = 'extracted_text.txt'
        else:
            raise ValueError("Supported formats: 'json', 'txt'")
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(output)
        print(f"Results exported to: {filename}")
        return filename
We define an AdvancedOCRAgent class that we initialize with multilingual EasyOCR and GPU support, and we set a confidence threshold to control output quality. We preprocess images (CLAHE, denoising, sharpening, adaptive thresholding), extract text, visualize bounding boxes and confidence, run smart pattern/language analysis, support batch folders, and export results as JSON or TXT. A quick usage sketch follows. Check out the FULL CODES here.
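As a quick illustration of the class outside the Colab upload flow, this sketch assumes a local file named invoice.png (a hypothetical filename) and runs extraction, analysis, and export directly.

# Hypothetical usage sketch: process a local image without the Colab upload step.
# "invoice.png" is an assumed filename; substitute any image on disk.
agent = AdvancedOCRAgent(languages=['en', 'fr'], gpu=True)
agent.confidence_threshold = 0.6  # tighten filtering for cleaner output

data = agent.extract_text('invoice.png', preprocess=True)
print(data['full_text'])
analysis = agent.smart_text_analysis(data['full_text'])
print(analysis['text_type'], analysis['language_detection'])
agent.export_results(data, 'txt')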
def demo_ocr_agent():
    """Demonstrate the OCR agent capabilities."""
    print("Advanced OCR AI Agent Demo")
    print("=" * 50)
    ocr = AdvancedOCRAgent(languages=['en'], gpu=True)
    image_path = ocr.upload_image()
    if image_path:
        try:
            results = ocr.extract_text(image_path, preprocess=True)
            print("\nOCR Results:")
            print(f"Words detected: {results['word_count']}")
            print(f"Lines detected: {results['line_count']}")
            print(f"Average confidence: {results['confidence_stats'].get('mean', 0):.2f}")
            print("\nExtracted Text:")
            print("-" * 30)
            print(results['full_text'])
            print("-" * 30)
            analysis = ocr.smart_text_analysis(results['full_text'])
            print("\nSmart Analysis:")
            print(f"Detected text type: {analysis['text_type']}")
            print(f"Language hints: {analysis['language_detection']}")
            if analysis['patterns']:
                print(f"Found patterns: {list(analysis['patterns'].keys())}")
            ocr.visualize_results(image_path, results)
            ocr.export_results(results, 'json')
        except Exception as e:
            print(f"Error: {str(e)}")
    else:
        print("No image uploaded. Please try again.")

if __name__ == "__main__":
    demo_ocr_agent()
We create a demo function that walks us through the full OCR workflow: we initialize the agent with English and GPU support, upload an image, preprocess it, and extract text with confidence stats. We then display the results, run smart text analysis to detect patterns and language hints, visualize bounding boxes and scores, and finally export everything into a JSON file. For whole folders of images, see the batch sketch below.
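The demo covers a single upload; for a folder of scans, a sketch like the following (assuming a directory named ./scans, which you would replace with your own) exercises process_batch and writes one combined JSON file.

# Hypothetical batch sketch: OCR every supported image in a folder.
# "./scans" is an assumed directory; point it at your own image folder.
agent = AdvancedOCRAgent(languages=['en'], gpu=True)
batch_results = agent.process_batch('./scans')

# Summarize and persist; default=str handles numpy types in the raw results.
for r in batch_results:
    print(f"{r['filename']}: {r['word_count']} words")
with open('batch_results.json', 'w', encoding='utf-8') as f:
    json.dump(batch_results, f, indent=2, ensure_ascii=False, default=str)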
In conclusion, we create a robust OCR pipeline that combines preprocessing, recognition, and analysis in a single Colab workflow. We enhance EasyOCR outputs using OpenCV techniques, visualize results for interpretability, and add confidence metrics for reliability. The agent is modular, allowing both single-image and batch processing, with results exported in JSON or text formats. This shows that open-source tools can deliver production-grade OCR without external APIs, while leaving room for domain-specific extensions like invoice or document parsing.