0% found this document useful (0 votes)

79 views5 pages

Python Scripts

Contains a Python script for translation.

Uploaded by

Pragit Sharma

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as TXT, PDF, TXT or read online on Scribd

0% found this document useful (0 votes)

79 views5 pages

Python Scripts

Contains a Python script for translation.

Uploaded by

Pragit Sharma

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as TXT, PDF, TXT or read online on Scribd

You are on page 1/ 5

#Run

%%capture
!git clone https://github.com/AI4Bharat/IndicTrans2.git

#Run
%%capture
%cd /content/IndicTrans2/huggingface_interface

#Run
%%capture
# The version specifier is quoted so the shell does not read ">" as an
# output redirect (which would install an unpinned transformers).
!python3 -m pip install nltk sacremoses pandas regex mock "transformers>=4.33.2" mosestokenizer
!python3 -c "import nltk; nltk.download('punkt')"
!python3 -m pip install bitsandbytes scipy accelerate datasets
!python3 -m pip install sentencepiece

!git clone https://github.com/VarunGumma/IndicTransTokenizer

%cd IndicTransTokenizer
!python3 -m pip install --editable ./
%cd ..

#Run
# Install pdfplumber, which the script imports to read text and tables from the PDF.
!pip install pdfplumber

Requirement already satisfied: pdfplumber in /usr/local/lib/python3.10/dist-

packages (0.11.0)
Requirement already satisfied: pdfminer.six==20231228 in
/usr/local/lib/python3.10/dist-packages (from pdfplumber) (20231228)
Requirement already satisfied: Pillow>=9.1 in /usr/local/lib/python3.10/dist-
packages (from pdfplumber) (9.4.0)
Requirement already satisfied: pypdfium2>=4.18.0 in
/usr/local/lib/python3.10/dist-packages (from pdfplumber) (4.30.0)
Requirement already satisfied: charset-normalizer>=2.0.0 in
/usr/local/lib/python3.10/dist-packages (from pdfminer.six==20231228->pdfplumber)
(3.3.2)
Requirement already satisfied: cryptography>=36.0.0 in
/usr/local/lib/python3.10/dist-packages (from pdfminer.six==20231228->pdfplumber)
(42.0.7)
Requirement already satisfied: cffi>=1.12 in /usr/local/lib/python3.10/dist-
packages (from cryptography>=36.0.0->pdfminer.six==20231228->pdfplumber) (1.16.0)
Requirement already satisfied: pycparser in /usr/local/lib/python3.10/dist-
packages (from cffi>=1.12->cryptography>=36.0.0->pdfminer.six==20231228-
>pdfplumber) (2.22)

#Run
# Install python-docx, which provides the `docx` package imported below to write the output file.
!pip install python-docx

Requirement already satisfied: python-docx in /usr/local/lib/python3.10/dist-

packages (1.1.2)
Requirement already satisfied: lxml>=3.1.0 in /usr/local/lib/python3.10/dist-
packages (from python-docx) (4.9.4)
Requirement already satisfied: typing-extensions>=4.9.0 in
/usr/local/lib/python3.10/dist-packages (from python-docx) (4.12.1)

##Restart session

# Run
import pdfplumber
from docx import Document
from docx.shared import Pt, Inches
from docx.oxml.ns import qn
from docx.oxml import OxmlElement
import torch
from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig
from IndicTransTokenizer import IndicProcessor, IndicTransTokenizer

#Run
def process_pdf(path):
    """Extract the text and the tables of every page in a PDF.

    Args:
        path: Location of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        A list containing each non-empty page text (``str``) in page order,
        followed by one ``{"type": "table", "content": table}`` dict for
        every table returned by ``page.extract_tables()``.
    """
    page_texts = []
    table_summaries = []

    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                page_texts.append(text)

            # Tables go into their own accumulator: the original appended to
            # an undefined name and let the loop variable overwrite the list
            # that was later concatenated onto the result.
            for table in page.extract_tables():
                table_summaries.append({"type": "table", "content": table})

    return page_texts + table_summaries

def initialize_model_and_tokenizer(ckpt_dir, direction, quantization):
    """Load a seq2seq checkpoint together with an IndicTrans tokenizer.

    Args:
        ckpt_dir: Model identifier or path handed to ``from_pretrained``.
        direction: Translation direction passed to ``IndicTransTokenizer``
            (the caller in this file uses ``"en-indic"``).
        quantization: ``"4-bit"`` or ``"8-bit"`` to load the model with a
            ``BitsAndBytesConfig``; any other value loads it without one.

    Returns:
        A ``(tokenizer, model)`` tuple; the model has been put in eval mode.
    """
    if quantization == "4-bit":
        qconfig = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    elif quantization == "8-bit":
        # NOTE(review): the bnb_8bit_* keywords are kept exactly as written,
        # but they do not look like documented BitsAndBytesConfig options -
        # confirm against the installed transformers version.
        qconfig = BitsAndBytesConfig(
            load_in_8bit=True,
            bnb_8bit_use_double_quant=True,
            bnb_8bit_compute_dtype=torch.bfloat16,
        )
    else:
        qconfig = None

    tokenizer = IndicTransTokenizer(direction=direction)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        ckpt_dir,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        quantization_config=qconfig,
    )

    # Only the unquantized model is moved to DEVICE (and halved on CUDA) here;
    # the quantized branches skip this step.
    if qconfig is None:
        model = model.to(DEVICE)
        if DEVICE == "cuda":
            model.half()

    model.eval()

    return tokenizer, model

def batch_translate(input_sentences, src_lang, tgt_lang, model, tokenizer, ip):
    """Translate sentences in chunks of ``BATCH_SIZE``.

    Args:
        input_sentences: Sequence of source strings.
        src_lang: Source language code given to ``ip.preprocess_batch``.
        tgt_lang: Target language code given to ``ip.preprocess_batch`` and
            ``ip.postprocess_batch``.
        model: Model whose ``generate`` method produces the output tokens.
        tokenizer: Callable tokenizer that also provides ``batch_decode``.
        ip: Processor providing ``preprocess_batch`` / ``postprocess_batch``.

    Returns:
        A list with the post-processed translations, in input order.
    """
    translations = []
    for start in range(0, len(input_sentences), BATCH_SIZE):
        batch = input_sentences[start : start + BATCH_SIZE]

        # Preprocess the batch and extract entity mappings
        batch = ip.preprocess_batch(batch, src_lang=src_lang, tgt_lang=tgt_lang)

        # Tokenize the batch and generate input encodings
        inputs = tokenizer(
            batch,
            src=True,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ).to(DEVICE)

        # Generate translations using the model
        with torch.no_grad():
            generated_tokens = model.generate(
                **inputs,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=5,
                num_return_sequences=1,
            )

        # Decode the generated tokens into text (kept under a separate name
        # so the token tensor and the decoded strings are not confused)
        decoded = tokenizer.batch_decode(
            generated_tokens.detach().cpu().tolist(), src=False
        )

        # Postprocess the translations, including entity replacement
        translations += ip.postprocess_batch(decoded, lang=tgt_lang)

        # Drop this batch's tensors before the next one is allocated
        del inputs
        torch.cuda.empty_cache()

    return translations

# Number of sentences handed to the model per translation batch.
BATCH_SIZE = 4
# Run on the GPU when PyTorch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
# Quantization mode passed to initialize_model_and_tokenizer ("4-bit", "8-bit" or None).
quantization = None
def main_fuc(path, filename):
    """Translate a PDF from ``eng_Latn`` to ``pan_Guru`` and save it as .docx.

    Args:
        path: Location of the source PDF, passed to ``process_pdf``.
        filename: Output file name without extension; ``".docx"`` is appended.

    Returns:
        None. The document is written to disk and ``"complete"`` is printed.
    """
    src_lang, tgt_lang = "eng_Latn", "pan_Guru"
    # Alternative checkpoint: ai4bharat/indictrans2-en-indic-dist-200M
    en_indic_ckpt_dir = "ai4bharat/indictrans2-en-indic-1B"

    # Only plain strings are sent to the translator; any other item that
    # process_pdf may return (e.g. a table record) is skipped.
    en_sents = [item for item in process_pdf(path) if isinstance(item, str)]

    translations = []
    if en_sents:
        # Load the model once for the whole document instead of once per
        # page, and let batch_translate do the batching.
        ip = IndicProcessor(inference=True)
        en_indic_tokenizer, en_indic_model = initialize_model_and_tokenizer(
            en_indic_ckpt_dir, "en-indic", quantization
        )
        translations = batch_translate(
            en_sents, src_lang, tgt_lang, en_indic_model, en_indic_tokenizer, ip
        )

        # flush the models to free the GPU memory
        del en_indic_tokenizer, en_indic_model
        torch.cuda.empty_cache()

    # Join with newlines so the last word of one page does not run into the
    # first word of the next.
    main_text = "\n".join(translations)

    # Create a new Document
    doc = Document()

    # Set 1-inch margins
    for section in doc.sections:
        section.top_margin = Inches(1)
        section.bottom_margin = Inches(1)
        section.left_margin = Inches(1)
        section.right_margin = Inches(1)

    # Add a paragraph; the run is created explicitly so it exists even when
    # there is no translated text (indexing runs[0] failed in that case).
    paragraph = doc.add_paragraph()
    run = paragraph.add_run(main_text)

    # Set font type and size
    run.font.size = Pt(12)

    # Use Raavi font
    run.font.name = "Raavi"
    # NOTE(review): only the eastAsia slot is set, as in the original; check
    # whether Word also needs w:cs for Gurmukhi text.
    run._element.rPr.rFonts.set(qn("w:eastAsia"), "Raavi")

    # Save the document
    doc.save(f"{filename}.docx")
    print("complete")

# 1. Filepath 2. Filename
Filepath = "/content/2003_S1_278_306.pdf"
file = "2003_S1_278_306"

#Run
# Pass the `file` variable rather than the literal "file", so the output
# document is named after the source PDF instead of always "file.docx".
main_fuc(Filepath, file)

#Output will be saved into docx

<ipython-input-2-5c32b1d8a4f9>:34: DeprecationWarning: This IndicTransTokenizer

is deprecated.
The official Tokenizer is available on HF and can be used as follows:
```
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
```
tokenizer = IndicTransTokenizer(direction=direction)
/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89:
UserWarning:
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab
(https://huggingface.co/settings/tokens), set it as secret in your Google Colab and
restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access
public models or datasets.
warnings.warn(
<ipython-input-2-5c32b1d8a4f9>:34: DeprecationWarning: This IndicTransTokenizer
is deprecated.
The official Tokenizer is available on HF and can be used as follows:
```
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
```
tokenizer = IndicTransTokenizer(direction=direction)

#process_pdf("/content/2006_1_138_148.pdf") # to see text from pdf

Hotel Reservation System
67% (6)
Hotel Reservation System
57 pages
Def Set Random Seed(Seed)
No ratings yet
Def Set Random Seed(Seed)
29 pages
Claude Comparet DB
No ratings yet
Claude Comparet DB
8 pages
Video Api Endpoint N
No ratings yet
Video Api Endpoint N
7 pages
NLP
No ratings yet
NLP
15 pages
Alpaca + Llama-3 8b Full Example - Ipynb - Colab
No ratings yet
Alpaca + Llama-3 8b Full Example - Ipynb - Colab
10 pages
Alpaca + Codellama 34b Full Example - Ipynb - Colab
No ratings yet
Alpaca + Codellama 34b Full Example - Ipynb - Colab
5 pages
LLM Code Ref
No ratings yet
LLM Code Ref
10 pages
Next With Continuos Run
No ratings yet
Next With Continuos Run
4 pages
QA Using Gemini Langchain ChromaDB PDF
No ratings yet
QA Using Gemini Langchain ChromaDB PDF
2 pages
Intent Recognizer
No ratings yet
Intent Recognizer
5 pages
Wa0029.
No ratings yet
Wa0029.
11 pages
Hand On Day 2 Salinan - Dari - 2 - Using - Transformers
No ratings yet
Hand On Day 2 Salinan - Dari - 2 - Using - Transformers
10 pages
Tutorials Sources Beginner Ptcheat
No ratings yet
Tutorials Sources Beginner Ptcheat
7 pages
Code2pdf 67c73149b96ef
No ratings yet
Code2pdf 67c73149b96ef
4 pages
22BCE9752 NLPDigital Assignment 02
No ratings yet
22BCE9752 NLPDigital Assignment 02
21 pages
Project Source
No ratings yet
Project Source
21 pages
Experimental Pix2pix
No ratings yet
Experimental Pix2pix
5 pages
Vit32 GPTMD
No ratings yet
Vit32 GPTMD
6 pages
Image Caption2
No ratings yet
Image Caption2
9 pages
Run 1
No ratings yet
Run 1
57 pages
Working Setup MulTalk - Windows
No ratings yet
Working Setup MulTalk - Windows
2 pages
MA - Economics - 2nd Sem - MAECO - 204 - Public Finance - May 2023
No ratings yet
MA - Economics - 2nd Sem - MAECO - 204 - Public Finance - May 2023
7 pages
AI Lab6
No ratings yet
AI Lab6
22 pages
Pgi20s02j - Lab Record
No ratings yet
Pgi20s02j - Lab Record
24 pages
Research Paper Summarization
No ratings yet
Research Paper Summarization
13 pages
Karpathy MinGPT Model
No ratings yet
Karpathy MinGPT Model
7 pages
Easyocr
No ratings yet
Easyocr
8 pages
Chatbot Code
No ratings yet
Chatbot Code
2 pages
Chatbot Code
No ratings yet
Chatbot Code
2 pages
Retorno 1
No ratings yet
Retorno 1
29 pages
Wa0028.
No ratings yet
Wa0028.
5 pages
stable_diffusion_report_updated
No ratings yet
stable_diffusion_report_updated
19 pages
tensor flow programs
No ratings yet
tensor flow programs
30 pages
Langchain Onepager
No ratings yet
Langchain Onepager
1 page
Pipeline Flux Ipa
No ratings yet
Pipeline Flux Ipa
18 pages
QLSTMvs LSTM
No ratings yet
QLSTMvs LSTM
7 pages
Assignment 2.3.1 Transfer Learning
No ratings yet
Assignment 2.3.1 Transfer Learning
7 pages
Exp 11 NLI USING BERT
No ratings yet
Exp 11 NLI USING BERT
4 pages
PyTorch Cheat Sheet & Quick Reference
No ratings yet
PyTorch Cheat Sheet & Quick Reference
6 pages
Image Captioning With Visual Attention PDF
No ratings yet
Image Captioning With Visual Attention PDF
16 pages
Lab 1 Summarize Dialogue
No ratings yet
Lab 1 Summarize Dialogue
26 pages
Chatbot Code
No ratings yet
Chatbot Code
2 pages
Trainrealfill
No ratings yet
Trainrealfill
19 pages
Deep Learning
No ratings yet
Deep Learning
46 pages
CV Prince
No ratings yet
CV Prince
120 pages
Sampleui
No ratings yet
Sampleui
3 pages
Gen Ai 7,8,9,10
No ratings yet
Gen Ai 7,8,9,10
7 pages
Message
No ratings yet
Message
3 pages
Medical Text Classifier GabrieldeOlaguibel
No ratings yet
Medical Text Classifier GabrieldeOlaguibel
12 pages
Lab 5
No ratings yet
Lab 5
7 pages
Code Explanation
No ratings yet
Code Explanation
8 pages
Language Translation With NN - Transformer and Torchtext - PyTorch Tutorials 2.3.0+cu121 Documentation
No ratings yet
Language Translation With NN - Transformer and Torchtext - PyTorch Tutorials 2.3.0+cu121 Documentation
8 pages
Natural Language Processing With Pytorch Readthedocs Io en Latest PDF
No ratings yet
Natural Language Processing With Pytorch Readthedocs Io en Latest PDF
35 pages
Kijai ComfyUI VEnhancer
No ratings yet
Kijai ComfyUI VEnhancer
76 pages
Retalking For High Resolution - Ipynb
No ratings yet
Retalking For High Resolution - Ipynb
1 page
Fine-Tuned Vs RAG Short Notes ?
No ratings yet
Fine-Tuned Vs RAG Short Notes ?
25 pages
Largescaiass 2
No ratings yet
Largescaiass 2
7 pages
Assignment 7
No ratings yet
Assignment 7
10 pages
ML 1
No ratings yet
ML 1
22 pages
GxAlert Install Using GxConnect - Mozambique
No ratings yet
GxAlert Install Using GxConnect - Mozambique
12 pages
A Guide To Effective Google Searching: For Beginners
No ratings yet
A Guide To Effective Google Searching: For Beginners
11 pages
Ch01 Introduction To Computer User Support
No ratings yet
Ch01 Introduction To Computer User Support
17 pages
Curriculum Vitae OF Ayesha Khanam: Career Objective
No ratings yet
Curriculum Vitae OF Ayesha Khanam: Career Objective
2 pages
Top 50+ Linux Commands You MUST Know - DigitalOceanZZZ
No ratings yet
Top 50+ Linux Commands You MUST Know - DigitalOceanZZZ
45 pages
Sekolah Tun Fatimah Jalan Tun Abdul Razak, 80000 Johor Bahru
No ratings yet
Sekolah Tun Fatimah Jalan Tun Abdul Razak, 80000 Johor Bahru
12 pages
Ansible
No ratings yet
Ansible
4 pages
MS Word Details
No ratings yet
MS Word Details
28 pages
45MG Upgrade Instructions Using Upgrade 2010
100% (1)
45MG Upgrade Instructions Using Upgrade 2010
3 pages
Readme Automated Tests
No ratings yet
Readme Automated Tests
2 pages
Written CCCaster Tutorial
No ratings yet
Written CCCaster Tutorial
2 pages
Informatica MCQs Set - 2 - Informatica Training & Programing Free Tutorials
No ratings yet
Informatica MCQs Set - 2 - Informatica Training & Programing Free Tutorials
2 pages
Object Oriented Analysis and Design - Lecture Notes, Study Material and Important Questions, Answers
No ratings yet
Object Oriented Analysis and Design - Lecture Notes, Study Material and Important Questions, Answers
4 pages
RandARP User Guide
No ratings yet
RandARP User Guide
24 pages
DB2 Load
No ratings yet
DB2 Load
20 pages
C++ - When To Use ' - Fastcall' Calling Convention - Stack Overflow
No ratings yet
C++ - When To Use ' - Fastcall' Calling Convention - Stack Overflow
2 pages
Utility Software
No ratings yet
Utility Software
2 pages
ICSO - Sample - Class 1 - 1 - Answer
No ratings yet
ICSO - Sample - Class 1 - 1 - Answer
3 pages
Independently Analyze Data Files.: Suggested Answers
No ratings yet
Independently Analyze Data Files.: Suggested Answers
1 page
Library Management System: A Project Report On
No ratings yet
Library Management System: A Project Report On
27 pages
Diptendu Tan: Highlights
No ratings yet
Diptendu Tan: Highlights
3 pages
Software Project Management: Telone Centre For Learning
No ratings yet
Software Project Management: Telone Centre For Learning
10 pages
Musicplayer - Docx Usining Python
No ratings yet
Musicplayer - Docx Usining Python
45 pages
PDF 24
No ratings yet
PDF 24
3 pages
Dell Emc Data Protection Suite Family
No ratings yet
Dell Emc Data Protection Suite Family
4 pages
Stack Overflow - Learning Lua
No ratings yet
Stack Overflow - Learning Lua
89 pages
Vivo V2419 V2419 2025-06-02 01-33-30
No ratings yet
Vivo V2419 V2419 2025-06-02 01-33-30
18 pages
Poweredge-4161ds User's Guide3 En-Us
No ratings yet
Poweredge-4161ds User's Guide3 En-Us
112 pages
Ashish Jaiswal Resume2024
No ratings yet
Ashish Jaiswal Resume2024
3 pages

Python Scripts

Uploaded by

Python Scripts

Uploaded by

#Run

!git clone https://github.com/VarunGumma/IndicTransTokenizer

Requirement already satisfied: pdfplumber in /usr/local/lib/python3.10/dist-

Requirement already satisfied: python-docx in /usr/local/lib/python3.10/dist-

final_output = text_ + table

def initialize_model_and_tokenizer(ckpt_dir, direction, quantization):

return tokenizer, model

# Preprocess the batch and extract entity mappings

# Tokenize the batch and generate input encodings

# Generate translations using the model

# Decode the generated tokens into text

# Postprocess the translations, including entity replacement

src_lang, tgt_lang = "eng_Latn", "pan_Guru"

# flush the models to free the GPU memory

# Set 1-inch margins

# Set font type and size

# Use Raavi font

# Save the document

#Output will be saved into docx

<ipython-input-2-5c32b1d8a4f9>:34: DeprecationWarning: This IndicTransTokenizer

#process_pdf("/content/2006_1_138_148.pdf") # to see text from pdf

You might also like