0% found this document useful (0 votes)
39 views

Python Scripts

Contains python script for translation.

Uploaded by

Pragit Sharma
Copyright
© © All Rights Reserved
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
39 views

Python Scripts

Contains python script for translation.

Uploaded by

Pragit Sharma
Copyright
© © All Rights Reserved
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
You are on page 1/ 5

#Run

%%capture
!git clone https://github.com/AI4Bharat/IndicTrans2.git

#Run
%%capture
%cd /content/IndicTrans2/huggingface_interface

#Run
%%capture
!python3 -m pip install nltk sacremoses pandas regex mock transformers>=4.33.2
mosestokenizer
!python3 -c "import nltk; nltk.download('punkt')"
!python3 -m pip install bitsandbytes scipy accelerate datasets
!python3 -m pip install sentencepiece

!git clone https://github.com/VarunGumma/IndicTransTokenizer


%cd IndicTransTokenizer
!python3 -m pip install --editable ./
%cd ..

#Run
!pip install pdfplumber

Requirement already satisfied: pdfplumber in /usr/local/lib/python3.10/dist-


packages (0.11.0)
Requirement already satisfied: pdfminer.six==20231228 in
/usr/local/lib/python3.10/dist-packages (from pdfplumber) (20231228)
Requirement already satisfied: Pillow>=9.1 in /usr/local/lib/python3.10/dist-
packages (from pdfplumber) (9.4.0)
Requirement already satisfied: pypdfium2>=4.18.0 in
/usr/local/lib/python3.10/dist-packages (from pdfplumber) (4.30.0)
Requirement already satisfied: charset-normalizer>=2.0.0 in
/usr/local/lib/python3.10/dist-packages (from pdfminer.six==20231228->pdfplumber)
(3.3.2)
Requirement already satisfied: cryptography>=36.0.0 in
/usr/local/lib/python3.10/dist-packages (from pdfminer.six==20231228->pdfplumber)
(42.0.7)
Requirement already satisfied: cffi>=1.12 in /usr/local/lib/python3.10/dist-
packages (from cryptography>=36.0.0->pdfminer.six==20231228->pdfplumber) (1.16.0)
Requirement already satisfied: pycparser in /usr/local/lib/python3.10/dist-
packages (from cffi>=1.12->cryptography>=36.0.0->pdfminer.six==20231228-
>pdfplumber) (2.22)

#Run
# python-docx provides the `docx` module imported below (Document, Pt, Inches, qn).
!pip install python-docx

Requirement already satisfied: python-docx in /usr/local/lib/python3.10/dist-


packages (1.1.2)
Requirement already satisfied: lxml>=3.1.0 in /usr/local/lib/python3.10/dist-
packages (from python-docx) (4.9.4)
Requirement already satisfied: typing-extensions>=4.9.0 in
/usr/local/lib/python3.10/dist-packages (from python-docx) (4.12.1)

##Restart session

# Run
import pdfplumber
from docx import Document
from docx.shared import Pt, Inches
from docx.oxml.ns import qn
from docx.oxml import OxmlElement
import torch
from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig
from IndicTransTokenizer import IndicProcessor, IndicTransTokenizer

#Run
def process_pdf(path):
    """Extract page text and tables from a PDF.

    Args:
        path: Location of the PDF, passed to ``pdfplumber.open``.

    Returns:
        A list containing the non-empty text of each page (in page order),
        followed by one ``{"type": "table", "content": rows}`` dict for every
        table returned by ``page.extract_tables()``. Callers that only want
        text should keep the ``str`` entries.
    """
    page_texts = []
    # Kept separate from the loop variable below; the original appended to an
    # undefined name and overwrote its own accumulator list.
    table_summaries = []

    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            # Pages without extractable text are skipped.
            if text:
                page_texts.append(text)

            for table in page.extract_tables():
                table_summaries.append({"type": "table", "content": table})

    return page_texts + table_summaries

def initialize_model_and_tokenizer(ckpt_dir, direction, quantization):
    """Load the seq2seq translation model and its tokenizer.

    Args:
        ckpt_dir: Checkpoint name or path handed to
            ``AutoModelForSeq2SeqLM.from_pretrained``.
        direction: Direction string handed to ``IndicTransTokenizer``
            (the caller in this file uses ``"en-indic"``).
        quantization: ``"4-bit"`` or ``"8-bit"`` to load the model with a
            ``BitsAndBytesConfig``; any other value loads it unquantized.

    Returns:
        A ``(tokenizer, model)`` tuple; the model has been put in eval mode.
    """
    if quantization == "4-bit":
        qconfig = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    elif quantization == "8-bit":
        # NOTE(review): the bnb_8bit_* names are not among the documented
        # BitsAndBytesConfig parameters - confirm they have any effect.
        qconfig = BitsAndBytesConfig(
            load_in_8bit=True,
            bnb_8bit_use_double_quant=True,
            bnb_8bit_compute_dtype=torch.bfloat16,
        )
    else:
        qconfig = None

    tokenizer = IndicTransTokenizer(direction=direction)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        ckpt_dir,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        quantization_config=qconfig,
    )

    # Only the unquantized model is moved to DEVICE explicitly, and only on
    # CUDA is it converted to half precision.
    if qconfig is None:
        model = model.to(DEVICE)
        if DEVICE == "cuda":
            model.half()

    model.eval()

    return tokenizer, model


def batch_translate(input_sentences, src_lang, tgt_lang, model, tokenizer, ip):
    """Translate sentences in slices of ``BATCH_SIZE`` and return the results.

    Args:
        input_sentences: Sequence of source-language strings.
        src_lang: Source language code given to ``ip.preprocess_batch``.
        tgt_lang: Target language code given to ``ip.preprocess_batch`` and
            ``ip.postprocess_batch``.
        model: Object whose ``generate`` method produces the output tokens.
        tokenizer: Callable that encodes a batch and offers ``batch_decode``.
        ip: Processor providing ``preprocess_batch`` / ``postprocess_batch``.

    Returns:
        A list with the post-processed translations, in input order.
    """
    # Generation settings are identical for every slice.
    generation_kwargs = dict(
        use_cache=True,
        min_length=0,
        max_length=256,
        num_beams=5,
        num_return_sequences=1,
    )

    translations = []
    for start in range(0, len(input_sentences), BATCH_SIZE):
        # Preprocess the slice (including entity handling done by the processor).
        chunk = ip.preprocess_batch(
            input_sentences[start : start + BATCH_SIZE],
            src_lang=src_lang,
            tgt_lang=tgt_lang,
        )

        # Encode the slice and move the encodings to the target device.
        inputs = tokenizer(
            chunk,
            src=True,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ).to(DEVICE)

        # Run generation without tracking gradients.
        with torch.no_grad():
            output_ids = model.generate(**inputs, **generation_kwargs)

        # Turn the generated token ids back into text.
        decoded = tokenizer.batch_decode(
            output_ids.detach().cpu().tolist(), src=False
        )

        # Post-process (including entity replacement) and collect.
        translations.extend(ip.postprocess_batch(decoded, lang=tgt_lang))

        # Drop the encodings and release cached GPU memory before the next slice.
        del inputs
        torch.cuda.empty_cache()

    return translations

# Number of sentences batch_translate sends through the model per step.
BATCH_SIZE = 4
# Use the GPU when torch reports one is available, otherwise the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Quantization mode passed to initialize_model_and_tokenizer:
# "4-bit", "8-bit", or None for no quantization.
quantization = None
def main_fuc(path, filename):
    """Translate a PDF from English to Punjabi and save it as a Word document.

    Args:
        path: PDF to read; passed to ``process_pdf``.
        filename: Output name without extension; ``".docx"`` is appended.

    The translated pages are written, one per line, into a single paragraph
    set in 12 pt Raavi with 1-inch margins. Prints ``"complete"`` when done.
    """
    src_lang, tgt_lang = "eng_Latn", "pan_Guru"
    # Alternative checkpoint: ai4bharat/indictrans2-en-indic-dist-200M
    en_indic_ckpt_dir = "ai4bharat/indictrans2-en-indic-1B"

    # Only plain-text entries can be fed to the translator; anything else
    # returned by process_pdf (e.g. table dicts) is skipped.
    pages = [item for item in process_pdf(path) if isinstance(item, str)]

    main_text = ""
    if pages:
        ip = IndicProcessor(inference=True)
        # Load the model and tokenizer once; reloading them for every page is
        # loop-invariant work.
        en_indic_tokenizer, en_indic_model = initialize_model_and_tokenizer(
            en_indic_ckpt_dir, "en-indic", quantization
        )
        translations = batch_translate(
            pages, src_lang, tgt_lang, en_indic_model, en_indic_tokenizer, ip
        )
        # Separate pages so the last word of one page does not fuse with the
        # first word of the next.
        main_text = "\n".join(translations)

        # Drop the model references before building the document.
        del en_indic_tokenizer, en_indic_model

    # Create a new Document with 1-inch margins on every section.
    doc = Document()
    for section in doc.sections:
        section.top_margin = Inches(1)
        section.bottom_margin = Inches(1)
        section.left_margin = Inches(1)
        section.right_margin = Inches(1)

    # add_run always creates a run, so the font can be set even when there is
    # no text (add_paragraph("") yields a paragraph with no runs, which made
    # paragraph.runs[0] raise IndexError).
    paragraph = doc.add_paragraph()
    run = paragraph.add_run(main_text)

    # Set font size and use the Raavi font.
    run.font.size = Pt(12)
    run.font.name = "Raavi"
    r_fonts = run._element.rPr.rFonts
    r_fonts.set(qn("w:eastAsia"), "Raavi")
    # Also set the complex-script font slot, which is the one applied to
    # Gurmukhi text.
    r_fonts.set(qn("w:cs"), "Raavi")

    # Save the document.
    doc.save(f"{filename}.docx")
    print("complete")

# 1. Filepath 2. Filename
Filepath = "/content/2003_S1_278_306.pdf"
file = "2003_S1_278_306"

#Run
# Pass the `file` variable (not the literal string "file") so the output
# document is named after the PDF.
main_fuc(Filepath, file)

#Output will be saved into docx

<ipython-input-2-5c32b1d8a4f9>:34: DeprecationWarning: This IndicTransTokenizer


is deprecated.
The official Tokenizer is available on HF and can be used as follows:
```
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
```
tokenizer = IndicTransTokenizer(direction=direction)
/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89:
UserWarning:
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab
(https://huggingface.co/settings/tokens), set it as secret in your Google Colab and
restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access
public models or datasets.
warnings.warn(
<ipython-input-2-5c32b1d8a4f9>:34: DeprecationWarning: This IndicTransTokenizer
is deprecated.
The official Tokenizer is available on HF and can be used as follows:
```
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
```
tokenizer = IndicTransTokenizer(direction=direction)

#process_pdf("/content/2006_1_138_148.pdf") # to see text from pdf

You might also like