forked from Sefaria/Sefaria-Project
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse_mishnah_terms.py
40 lines (31 loc) · 1.49 KB
/
parse_mishnah_terms.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# -*- coding: utf-8 -*-
from sefaria.model import *
import csv
def create_word_form(form, lang, lookups):
    """Build (but do not save) a WordForm record.

    :param form: text stored under the ``form`` key.
    :param lang: value stored under the ``language_code`` key.
    :param lookups: value stored under the ``lookups`` key.
    :return: a new ``WordForm`` constructed from those three fields.
    """
    attrs = {
        'form': form,
        'language_code': lang,
        'lookups': lookups,
    }
    return WordForm(attrs)
# Import the 'Halachic Terminology' glossary from a tab-separated file: create the
# Lexicon, then one LexiconEntry per row plus a WordForm for every spelling of it.
#
# Columns read from each row (0-based):
#   0 - English form            1 - Hebrew headword
#   2 - extra English forms     3 - extra Hebrew forms   (both comma-separated)
#   4 - optional citation, resolved as 'Mishnah <citation>'
#   5 - definition
# NOTE(review): column meanings are inferred from how each index is used below;
# confirm against the spreadsheet.
with open('data/tmp/Halachic Terminology Berachot.tsv', 'rb') as csvfile:
    lexicon_name = 'Halachic Terminology'
    mishna_eng = Lexicon({'name': lexicon_name, 'language': 'heb.mishnaic', 'to_language': 'eng'})
    mishna_eng.save()

    lex_csv = csv.reader(csvfile, delimiter='\t')
    next(lex_csv, None)  # discard the first row (presumably the column headers)

    for entry in lex_csv:
        # csv.reader yields an empty list for a blank line, and a truncated line
        # gives fewer columns; indexing entry[5] on those would raise IndexError.
        if len(entry) < 6:
            if entry:
                print("Skipping row with fewer than 6 columns: %r" % (entry,))
            continue

        # Strip once and reuse, so the LexiconEntry headword and the headword
        # stored in every WordForm lookup are guaranteed to be identical.
        headword = entry[1].strip()
        dict_entry = {
            'headword': headword,
            'parent_lexicon': lexicon_name,
            'content': {'definition': entry[5].strip()},
        }

        citation = entry[4].strip()
        if citation:
            try:
                nref = Ref('Mishnah %s' % citation).normal()
            except Exception as e:
                # Best effort: an unparseable citation must not abort the import,
                # but report it so the source data can be corrected.
                # %r keeps the message ASCII-safe for byte strings.
                print("Could not resolve citation %r for headword %r: %r" % (citation, headword, e))
            else:
                dict_entry['content']['refs'] = [{'definition': '', 'ref': nref}]

        LexiconEntry(dict_entry).save()

        forms = [(entry[0], 'eng'), (headword, 'heb')]
        forms += [(x, 'eng') for x in entry[2].split(",")]
        forms += [(x, 'heb') for x in entry[3].split(",")]
        for raw_form, lang in forms:
            # Comma-separated lists leave padding around each item; save the
            # stripped text and drop items that are empty or whitespace-only.
            text = raw_form.strip()
            if not text:
                continue
            # Python 2: the file is opened in binary mode, so values are byte
            # strings; the decode/encode round trip raises on invalid UTF-8.
            print(unicode(text, 'utf-8').encode('utf-8'))
            create_word_form(text, lang, [{'headword': headword, 'lexicon': lexicon_name}]).save()