Python Scripts
Python Scripts
%%capture
!git clone https://github.com/AI4Bharat/IndicTrans2.git
#Run
%%capture
%cd /content/IndicTrans2/huggingface_interface
#Run
%%capture
!python3 -m pip install nltk sacremoses pandas regex mock transformers>=4.33.2
mosestokenizer
!python3 -c "import nltk; nltk.download('punkt')"
!python3 -m pip install bitsandbytes scipy accelerate datasets
!python3 -m pip install sentencepiece
#Run
!pip install pdfplumber
#Run
!pip install python-docx
## Restart the session before running the cells below
# Run
import pdfplumber
from docx import Document
from docx.shared import Pt, Inches
from docx.oxml.ns import qn
from docx.oxml import OxmlElement
import torch
from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig
from IndicTransTokenizer import IndicProcessor, IndicTransTokenizer
#Run
def process_pdf(path):
    """Extract the text of every page of a PDF.

    Args:
        path: Location of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        A list with one string per page that yielded text, in page order.
        Pages for which ``extract_text()`` returns nothing are skipped.
    """
    page_texts = []
    # Tables are gathered alongside the text, but nothing visible in this
    # file consumes them, so they are not part of the return value.
    # NOTE(review): confirm whether the tables should be returned/translated.
    table_summaries = []
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                page_texts.append(text)
            # A distinct loop name keeps the accumulator from being shadowed.
            table_summaries.extend(
                {"type": "table", "content": rows}
                for rows in page.extract_tables()
            )
    # The caller iterates over the result page by page.
    return page_texts
# NOTE(review): the enclosing `def` line, the statements that build `qconfig`
# and the final `return` are not present in this copy of the file. Judging by
# the call in main_fuc, this is presumably the body of
# initialize_model_and_tokenizer(ckpt_dir, direction, quantization), which is
# expected to hand back a (tokenizer, model) pair - TODO restore the missing
# lines and the indentation.
tokenizer = IndicTransTokenizer(direction=direction)
model = AutoModelForSeq2SeqLM.from_pretrained(
ckpt_dir,
trust_remote_code=True,
low_cpu_mem_usage=True,
quantization_config=qconfig,
)
# Without a quantization config the model is moved to DEVICE explicitly.
# NOTE(review): `is None` is the idiomatic comparison here.
if qconfig == None:
model = model.to(DEVICE)
# On the GPU the weights are converted to half precision.
# NOTE(review): nesting relative to the check above is lost - confirm.
if DEVICE == "cuda":
model.half()
# Switch the model to evaluation mode.
model.eval()
# NOTE(review): only the last three statements of this function survive in
# this copy; its `def` line and the code that creates `inputs` and
# `translations` are missing. main_fuc calls
# batch_translate(en_sents, src_lang, tgt_lang, model, tokenizer, ip) and
# indexes the result, so this is presumably the end of that function -
# TODO restore the missing lines and the indentation.
# Drop the reference to the batch inputs, then release cached CUDA memory.
del inputs
torch.cuda.empty_cache()
return translations
# Batch size setting for translation.
BATCH_SIZE = 4

# Run on the GPU when torch can see one; otherwise stay on the CPU.
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda"

# Quantization setting handed to initialize_model_and_tokenizer by main_fuc.
quantization = None
def main_fuc(path, filename, src_lang="eng_Latn", tgt_lang="hin_Deva"):
    """Translate a PDF page by page and write the result to a Word document.

    Args:
        path: PDF to translate; handed to ``process_pdf``.
        filename: Base name of the output; the document is saved as
            ``<filename>.docx``.
        src_lang: Source language code forwarded to ``batch_translate``.
        tgt_lang: Target language code forwarded to ``batch_translate``.

    Returns:
        The path of the saved ``.docx`` file.
    """
    # NOTE(review): the previous version read src_lang / tgt_lang and `doc`
    # without defining them and never used `filename`. The language defaults
    # (English -> Hindi) and the "<filename>.docx" output name are assumptions
    # based on the "en-indic" direction and the `hi_translations` name -
    # confirm both against the intended workflow.
    pages = process_pdf(path)

    translated_pages = []
    if pages:
        # Load the checkpoint once: it does not depend on the page, and
        # re-loading it for every page is very expensive.
        # Alternative checkpoint: ai4bharat/indictrans2-en-indic-dist-200M
        en_indic_ckpt_dir = "ai4bharat/indictrans2-en-indic-1B"
        en_indic_tokenizer, en_indic_model = initialize_model_and_tokenizer(
            en_indic_ckpt_dir, "en-indic", quantization
        )
        ip = IndicProcessor(inference=True)

        for page_text in pages:
            # Each page is sent as a single-item batch, as before.
            hi_translations = batch_translate(
                [page_text],
                src_lang,
                tgt_lang,
                en_indic_model,
                en_indic_tokenizer,
                ip,
            )
            translated_pages.append(hi_translations[0])

    # Pages are concatenated without a separator, matching the old `+=` loop.
    main_text = "".join(translated_pages)

    doc = Document()
    # Add a paragraph
    doc.add_paragraph(str(main_text))

    output_path = f"{filename}.docx"
    doc.save(output_path)
    return output_path
# 1. Filepath 2. Filename
Filepath = "/content/2003_S1_278_306.pdf"
file = "2003_S1_278_306"
#Run
# Pass the `file` variable rather than the literal string "file", so the
# filename configured above is the one main_fuc receives.
main_fuc(Filepath, file)