# -*- coding: utf-8 -*- """ Created on Thu Mar 27 10:37:23 2025 @author: Arnd Hemlut Hafner """ ############# """ This script encodes different versions of transcriptions of the same text in order to collate them with each other. Main processing flow: Creating TEI encoded xml tree from every textfile version of text. Outputting encoded xml text in textfile format for checking. Comparing xml element sequences, creating TEI encoded xml file entailing one lemmatized version and annotations concerning variations. Output in different html formats. Structure of script: 1. Main body 1.1. Initiationg tag and glyph dictionaries 1.2. Reading input files, creating TEI encoded texts and converting them into XML files. 1.3. Outputting encoded texts for checking purpose. 1.4. Collating encoded texts and creating one lemmatized version of encoded text (not yet written) 2. Creator module "createTEItexts" 2.1. Initiating new instance of class EncodedText 2.2. Reading text from textfiles line by line, 2.3. Discerning subtitles from main text, and feeding them into encoding class EncodedText. Calling the following functions of class EncodedText during processing: 3.2. setsubsection() 3.3. addmaintext() 3.4. completesubsection() 3.5. completeencoding() 2.4. Converting encoded texts into xml-objects 3. Encoding class EncodedText 3.1. Creating empty and elements in textform, and initiating several control variables(__init__) 3.2. Setting up a new subsection in for every heading discovered in the text by creator module (setsubsection) 3.3. Processing main text glyph by glyph and adding them to element (addmaintext), setting up and closing text line elements in before and after processing. Calling the following class functions during processing: 3.3.1. settextline() 3.3.2. chopfirstglyph() 3.3.2.1. Pooling regex and functions in list Loop 3.3.2.2. Looping over regex, chopping glyph/glyph-units, and delegating creation of apparatuses to functions. 
In case of regex matching, the loop is exited after creation of apparatus. Order of application of regexs matters! 3.3.3. completetextline() 3.4. Closing subsection of before every new heading and before ending entire encoding process (completesubsection) 3.5. Closing all subsections of and as well as the two elements themselves, and outputting the glyph dictionary generated during processing (completeencoding) 3.6. Several auxiliary functions used by 3.3.2. chopfirstglyph() 4.Collating module (not yet created) Functions jointly used by class EncodedText """ import sys # for sys.exit after error import re from re import compile, search import os from bs4 import BeautifulSoup #for creation of xml element from tags import Tags #dictionary of tag names from collections import namedtuple as _namedtuple #used by newdiff from pprint import pprint #used by newdiff #1. Main body #1.1. Initiationg tag and glyph dictionaries tags = Tags().tags class Glyph(): """ 主として漢字もglyphとしてエンコーディングするための辞書。 """ glyphcount = -1 def __init__(self): #glyphnodic== {glyph:glyphno} with open('glyphs.txt', 'r',encoding='utf-8-sig') as file: self.glyphnodic = {key.strip(): value.strip() for key, value in (line.split(':', 1) for line in file)} #glyphdic== {glyphno:glyph} with open('glyphs.txt', 'r',encoding='utf-8-sig') as file: self.glyphdic = {value.strip(): key.strip() for key, value in (line.split(':', 1) for line in file)} self.nonunicode = {} def lookupglyphnumber(self,glyph): """ 字単位のエンコーディングを行う際に、辞書への収録を確認し、必要に応じて 辞書登録を行いつつ、glyphコードを返す。 原文句読記号や現代の句読点を除いては、0から始まる通し番号を附してエンコーディングを行う。 Parameters ---------- glyph : str 漢字などの文字。一字のみ Returns ------- str glyphコード() """ if glyph in self.glyphnodic.keys() : pass else: type(self).glyphcount += 1 glyphno = str(type(self).glyphcount) self.glyphnodic[glyph] = glyphno self.glyphdic[glyphno] = glyph if '{' in glyph: name = '00000' + str(len(self.nonunicode)+1) name = name[-6:] self.nonunicode[glyph] = name return self.glyphnodic[glyph] def 
encodeglyph(self,glyph,typ='original',cert='high'): """ Converts single glyphs into glyph elements. Glyph element default format: Indication of doubts ('(?)'): "high" is replaced with "low" Indication of context based readings (補釋=打框字): "original" is replaced with "context" Parameters ---------- glyph : str glyph to be converted typ : str, optional Indication of transcription type. The default is 'original'. cert : str, optional Indication of confidence in transcription. The default is 'high'. Returns ------- glyph : str element in text format. Is to be converted into xml element finally. """ #format check for glyph if len(re.findall('(?:{[^}]+}|.)',glyph)) > 1: sys.exit('Error in encodeglyph for {}'.format(glyph)) #create default glyph = re.sub('glyphnumber',self.lookupglyphnumber(glyph),tags['glyph']) #adjust attributes if typ !='original': glyph = re.sub('original',typ,glyph) if cert !='high': glyph = re.sub('high',cert,glyph) return glyph def decodeglyph(self,glyph): """ Returns glyphs into ordinary glyphs Adds '&' or '?' for context-based or uncertain readings(大框字或存疑) Parameters ---------- glyph : str DESCRIPTION. Returns ------- glyph : str 字 or 字& or 字? or 字&? """ m1 = re.search('ref="#([^"]+)"',glyph) m2 = re.search('type="([^"]+)"',glyph) m3 = re.search('cert="([^"]+)"',glyph) glyphno,typ,cert = m1.group(1),m2.group(1),m3.group(1) glyph = self.glyphdic[glyphno] if typ == 'context':glyph +='&' if glyph == 'low':glyph += '?' return glyph def encodeglyphstring(self,string): if '&' in string: sys.exit('Precaution! Glyph & in string to be encoded! String is {}'.format(string)) elif '﹦' in string: sys.exit('Precaution! Glyph ﹦ in string to be encoded! 
String is {}'.format(string)) glyph = re.findall('(?:{[^}]+}|.)??',string) for g in glyph: if not re.search('[<>/a-zA-Z\\d\\./"_ #\\(\\)=]',g): if re.search('.?', g): string = re.sub(g,self.encodeglyph(re.sub('?$','',g),'original','low'),string,1) else: string = re.sub(g,self.encodeglyph(g),string,1) return string def glyphoutput(self,outputfile): """ glyphとしてエンコーディングされた文字の辞書を出力する Parameters ---------- dict : dic エンコーディングにおいて生成された文字辞書(目下は原文句読記号のみ) outputfile : I/O Wrapper Returns ------- None. """ for key, val in self.glyphnodic.items(): outputfile.write('\n'.format(val)) if '{' in key: outputfile.write('\t{}\n'.format(key)) outputfile.write('\t\n'.format(self.nonunicode[key])) else: outputfile.write('\t{}\n'.format(key)) outputfile.write('\n') def nonunicodeoutput(self,outputfile): for key, val in self.nonunicode.items(): outputfile.write('{}\t{}\n'.format(val,key)) glyphs = Glyph() appIDformat = compile('0*(\\d{4,})') #tools for encoding process def createpointer(pointerID,glyphcount): pointer = tags['pointerglyph'] return re.sub('pointerID',pointerID,re.sub('字數',str(glyphcount),pointer)) def searchpreparation(string,sufflate,prefix = '',suffix = ''): ''' Inserts regex into strings in order to suppress interference of irregular symbols during search Parameters ---------- string : str searched string sufflate : str Regex inserted between the constituent glyphs of the string prefix : str, optional Regex inserted before the string. The default is ''. suffix : str, optional Regex inserted behind the string. The default is ''. 
Returns ------- string: str ''' #記号交じりの原文から文字列を検索するために、文字列を正規表現に置き換える #string = 検索したい文字列 #prefex = 原文では検索したい文字列の前にあり得る記号のリストの正規表現 #sufflate = 検索したい文字列の間に出現しうる記号のリストの正規表現 #suffix = 検索したい文字列の後ろに出現しうる記号のリストの正規表現 string = re.findall('.',string) tempstring = prefix + string[0] for c in range(1,len(string)): tempstring += sufflate + string[c] string = tempstring + suffix return(string) def hyphenencoding1(string,tagtrunk1,tagtrunk2 ='reg'): """ 字1=字2=(字1-字3,字2)における"字1-字3"の部分にタグを付ける Parameters ---------- string : str タグ付け対象文字列 tagtrunk1 : str 字1-字3における字1を記述するタグの根幹部分 tagtrunk : str 字1-字3における字3を記述するタグの根幹部分 Returns ------- string = str """ replace = compile('((?:{[^}]+}|.)??)\\-((?:{[^}]+}|.)??)') return replace.sub('{}{}\\g<1>{}{}\\g<2>{}{}'.format(tags['choiceb'],tags[tagtrunk1+'b'],tags[tagtrunk1+'e'],tags[tagtrunk2+'b'],tags[tagtrunk2+'e'],tags['choicee']),string) def punctuationencoding(string): punctuationmarks = re.findall('([。,、?!:;])', string) for mark in punctuationmarks: glyph = glyphs.encodeglyph(mark) string = re.sub(mark,glyph,string,1) return string #tools for xml convertion and xml object transformations def createxml(encodedTexts): """ Transforming encoded texts into xml-objects Parameters ---------- encodedTexts : list List of encoded texts Returns ------- xml_Texts : list List of xml convertions of encoded texts """ xml_Texts = [] for encodedText in encodedTexts: xml_Text = BeautifulSoup(encodedText.body + ''.join(encodedText.back),'xml') xml_Texts.append(xml_Text) return xml_Texts def attrlistup(xmlobject,tag,attr): attrlist = [] for tag_element in xmlobject.find_all(tag): attrlist.append(tag_element[attr]) return attrlist #Other tools #3 Class for encoding class EncodedText: """ First, one ecoded text for every transcription of the text is created ("single version"), second, after collation of different transcriptions, one comprehensive encoded text entailing all differences between different transcription soucres is created ("lemmatized version"). 
Structure of TEI encoded text: 1)Single version(for details see Sample(TEIencoding,(single version).xml)): ...
subtitle

plain sliptext

...
subtitle

plain sliptext

...
.../lem>
.../lem>
.../lem>
......
2)Lemmatized version In add one
to annotate differences in primary transcription. In every add , add @source also to . Xml:id for every source is documented in within of """ #3.1.Creating empty and elements in textform, #and initiating control variables(__init__) def __init__(self): self.body = tags['textb'] + tags['bodyb'] self.back = [tags['backb']+tags['divcoll1'],tags['divcoll2'],tags['divdupl'],tags['divrepl'],tags['divsuppliedelem'],tags['divsuppliedinterp']] self.prevglyph = ''#necessary to check the usage of the original punctuation mark '┘'. self.dupstretchover = ()#for storage of duplication mark expressions that stretch over two slips. self.prevslipno = '' self.apparatus = '' self.appcount = 0 self.appID = '' self.glyphcount = 1 #3.2. Setting up a new subsection in def setsubsection(self,subtitle): self.body += tags['divsection'] + tags['headb'] + subtitle + tags['heade'] #3.3. Processing main text glyph by glyph def addmaintext(self,text,slipno): #Resetting pointer ID used to link apparatus with main text lines slipno = 'J' + slipno okikae = compile('\\((\\d+)\\)') #slipno will be used as xml:id, parantheses etc. 
not allowed→converting (2) and (3) into J2J and J3J if re.search(okikae,slipno): slipno = okikae.sub('S\\g<1>E',slipno) self.pointerID = self.slipno = slipno #Setting up new text line element in , while checking for stretchovers from previous text line self.settextline(slipno) #if stretchovers should pushed into following line: #if not self.dupstretchover:self.settextline(slipno) #Resetting glyph count used to link apparatus with particular glyph elements in the main text line self.slipglyphcount = self.glyphcount = 1 #Processing text glyph by glyph while text: text = self.chopfirstglyph(text) #Closing text line element of , while checking for stretchovers to following text line self.completetextline() #if stretchovers should pushed into following line: #if not self.dupstretchover:self.completetextline() #Resetting control variables self.prevslipno = self.slipno self.prevglyph = '' #3.3.1. def settextline(self,slipno): self.body += re.sub('簡號',slipno,tags['pb']) #3.3.2. def chopfirstglyph(self,text): """ Chop the first glyph or glyph unit from the text line of a slip, returning the left glyphs. Annotations of glyphs embedded in the sliptext are taken together with the annotated glyphs as glyph units. That means "字(字)" and "字﹦字﹦(字字,字字)" etc. are handled and counted as one glyph unit. Chopped glyphs or glyph units are inputted into apparatus creating functions. 
Main Flow: Pooling regex and functions in list Loop Looping over loop, chopping glyph/glyph-units, and delegating creation of apparatus to functions (self.prevglyph is used as storage for preceding glyphs, only used for apparatus creation of original punctuation mark '┘') Parameters ---------- text : str sliptext Returns ------- text : str """ def addnewtext(text): if 'J' in self.pointerID: self.body += text elif 'A' in self.pointerID: self.apparatus += text else: sys.exit('Error in addnewtext for glyph {} on slip no {}'.format(text,self.slipno)) def addnewapp(apparatus,n): #n = number of list except for apparatuses of supplements if 'J' in self.pointerID: self.back[n] += apparatus elif 'A' in self.pointerID: self.back[5] += apparatus else: sys.exit('Error in addnewapp for glyph {} on slip no {}'.format(text,self.slipno)) def addglyphcount(newtext,glyphcount = 0): if not glyphcount: glyphcount = len(re.findall('<[gs]',newtext,)) self.glyphcount += glyphcount def appsingle(self,m): if '&' in m.group(0) : sys.exit('Undetected & in slipno {}'.format(self.slipno)) elif '﹦' in m.group(0) : sys.exit('Undetected ﹦ in slipno {}'.format(self.slipno)) newtext = glyphs.encodeglyph(m.group(0)) addnewtext(newtext) addglyphcount(newtext) def native_tags(self,m): if re.search('^', m.group(3)),tagtrunk1,tagtrunk2) else: apparatus += tags[tagtrunk1 + 'b'] + mainglyph + tags[tagtrunk1 + 'e'] apparatus += tags[tagtrunk2 + 'b'] + glyphs.encodeglyphstring(regex.sub('\\g<1>',m.group(3))) + tags[tagtrunk2 + 'e'] addnewapp(self.completeapparatus(apparatus),3) return def apperror(self,m): replacement(m,'sic','corr') def appreinterpret(self,m): replacement(m,'orig','reg') def appredund(self,m): newtext = glyphs.encodeglyph(m.group(1)) addnewtext(newtext) apparatus = self.setapparatus() + tags['surplusred'] + newtext + tags['surpluse'] pointer1 = createpointer(self.pointerID,self.glyphcount) addglyphcount(newtext) pointer2 = createpointer(self.pointerID,self.glyphcount-1) apparatus = 
re.sub('pointer1',pointer1,re.sub('pointer2',pointer2, apparatus)) addnewapp(self.completeapparatus(apparatus),3) def appdoubt(self,m): if m.group(2):#"字&(?)" #m.group(2) = "&" #m.group(1) = 字 newtext = glyphs.encodeglyph(m.group(1),'context','low') else: newtext = glyphs.encodeglyph(m.group(1),'original','low') addnewtext(newtext) addglyphcount(newtext) def apporipunct(self,m): """ Handles the original puntuation mark '┘' Parameters ---------- m : re.Match regex == r'^(┘)(&?)(?(??)([。,、?!:;]?))?' Returns ------- None (Adds original punctuation mark to text and increases glyphcount). """ oripunct = m.group(1) if m.group(4): #group(4) =="([。,、?!:;]?)" before ")"→"┘(?。)" or "┘(,)" etc. n = re.search('^({[^}]+}|.)(&?)(([^)]+))',m.group(0)) appreinterpret(self,n)#Handles supplied modern punctuation like other replacements return#preventing counting self.glyphcount twice. elif m.group(3): #group(3) == "(??)" after "("→ "┘(?)" if not self.prevglyph or self.prevglyph in '。,、?!:;':#Cannot be a punctuation mark n = re.search('^({[^}]+}|.)(&?)(?)',m.group(0)) appdoubt(self,n) else:#Most probably a punctuation mark, but also possibility of doubt n = re.search('^({[^}]+}|.)(&?)(([^)]+))',m.group(0)) appreinterpret(self,n) print('Precaution! Ambivalence in slip {} for {}. {} was tentatively interpreted as a replacement expression. Check whether "?" was not intended to express doubt about the reading!'.format(self.slipno,m.group(0),m.group(0))) return#preventing counting self.glyphcount twice. elif m.group(2): newtext = glyphs.encodeglyph(oripunct,'context') else: newtext = glyphs.encodeglyph(oripunct) addnewtext(newtext) addglyphcount(newtext) def appduplication(self,m): slipno = self.slipno pointerID = self.pointerID #abbr = "字﹦", "字﹦字﹦"etc., expan ="(字字)","(字字,字字)" etc. 
abbr,expan = m.group(1),m.group(2) elementlist = re.findall('(?:{[^}]+}|[^()&﹦])[(?)&]*﹦[(?)&]*',abbr) def appinit(): apparatus = self.setapparatus() + tags['choiceb'] + tags['abbrb'] #Searching for fragments of duplication expression at the end of previous slip prevabbr = '' prevslipno = self.prevslipno if self.dupstretchover: if prevslipno != self.dupstretchover[0]: sys.exit('duplication stretching over more than 2 slips\ncurrent slip {}\nstretching from {}\nprevslipno is {}'.format(slipno,self.dupstretchover[0],prevslipno)) prevslipno,prevglyphcount,prevabbr = self.dupstretchover self.dupstretchover = '' pointer1 = createpointer(prevslipno,prevglyphcount) else: pointer1 = createpointer(pointerID,self.glyphcount) addglyphcount(''.join(elementlist),len(elementlist)*2) pointer2 = createpointer(pointerID,self.glyphcount-1) return prevabbr, re.sub('pointer1',pointer1,re.sub('pointer2',pointer2, apparatus)) def abbrencoding(abbr): abbrenc= abbr #handling annotations on context based transcriptions(補釋=打框字) and doubts firstglyph = re.findall('^(?:{[^}]+}|.)',abbr)[0] #annotations on plain text found = False for regex1,regex2,typ,cert in [('','&(?)','context','low'), ('','&','context','high'), ('','(?)','original','low'), ('^','﹦','original','high')]: regex = regex1 + firstglyph + regex2 if re.search(re.compile(regex),abbr): firstglyphencode = glyphs.encodeglyph(firstglyph,typ,cert) if regex1:regex = firstglyph abbrenc = re.sub(regex,firstglyphencode,abbrenc,1) abbrtmp = re.sub(regex,firstglyphencode,abbr,1) found = True break if not found: sys.exit('unknown element in duplication mark matching(1st if)! 
Element is {},firstglyph is {}'.format(element,firstglyph)) #annotations on duplication marks firstglyph = firstglyphencode found = False for regex,typ,cert in [('﹦&(?)','context','low'), ('﹦&','context','high'), ('﹦(?)','original','low'), ('﹦$','original','high')]: if re.search(firstglyph + regex,abbrtmp): abbrenc = re.sub(regex,'﹦',abbrenc,1) abbrenc = re.sub(firstglyph + '﹦',firstglyph + tags['amb'] + glyphs.encodeglyph('﹦',typ,cert) + tags['ame'],abbrenc,1) found = True break if not found: sys.exit('unknown element in duplication mark matching(2nd if)! Element is {},firstglyph is {}'.format(element,firstglyph)) return abbrenc def expanbreakdown(abbr,expan,abbrlen): """discerning two parts of expansion(expan): expanmain == the string directly corresponding to original glyphs ex == the string created based on an interpretation of the duplication marks The main problem to solve is the random occurence of modern punctuation marks within the expansion. """ if abbrlen > 1 and '-' not in expan:#'肆﹦室﹦(肆、室。肆、室)'等に対応する abbr2 = searchpreparation(abbr,'[。,、?!:;]?') elif '-' in expan:#'气﹦鞫﹦(气-乞鞫,气-乞鞫)'等 abbr2 = searchpreparation(abbr,'(?:\\-.)?[。,、?!:;]?','','(?:\\-.)?') else:abbr2 = abbr regex = compile('(' + abbr2 + '[。,、?!:;]?)') m1 = re.search(regex,expan) if m1: expanmain = m1.group(1) ex = re.sub(regex,'',expan,1) else:sys.exit('Mismatch in appabbr for {}\nbango is {}\n'. format(expan,slipno)) return expanmain, ex def expanencoding(expanmain,ex): #handling "字-字" constellations if '-' in expan: #encoding "字-字" constellations expanmain,ex = hyphenencoding1(expanmain,'orig'),hyphenencoding1(ex,'ex') else: ex = tags['exb'] + ex + tags['exe'] #Encoding punctuation marks in expanmain and ex. 
expanmain = punctuationencoding(expanmain) ex = punctuationencoding(ex) return glyphs.encodeglyphstring(expanmain), glyphs.encodeglyphstring(ex) #initializing apparatus element and handling stretchovers prevabbr, apparatus = appinit() #encoding abbreviation part of dulplication mark expression #dividing"字=字="into"字=" elements, encode transcription annotations("&","(?)"etc), #and pushing sliptext to body if prevabbr: prevabbrenc = abbrencoding(prevabbr) + tags['lb'] """ The following script lines apply to the case that stretchovers should be completely pushed to following line. newtext = re.sub('','',prevabbrenc) addnewtext(newtext) self.completetextline() self.settextline(slipno) """ apparatus += prevabbrenc abbrenc = '' for element in elementlist: abbrenc += abbrencoding(element) newtext = re.sub('','',abbrenc) addnewtext(newtext) apparatus += abbrenc + tags['abbre'] + tags['expanb'] #encoding expansion part of duplication mark expression abbr = prevabbr + abbr abbrlen = len(re.findall('﹦',abbr)) abbr = re.sub('[&﹦(?)]','',abbr) expanmain, ex = expanbreakdown(abbr,expan,abbrlen) #encoding of expanmain and ex expanmainenc, exenc = expanencoding(expanmain,ex) #putting expanmain and ex together again if abbr[0] == expan[0]: apparatus += expanmainenc + exenc else:#'夫=(大夫)' etc. apparatus += exenc + expanmainenc #finishing apparatus and pushing to apparatus += tags['expane'] + tags['choicee'] addnewapp(self.completeapparatus(apparatus),2) def appdupstretchover(self,m): self.dupstretchover = (self.slipno,self.glyphcount,m.group(0)) for n in ((1,2,3),(4,5,6)): glyph = m.group(n[0]) if m.group(n[1]): typ = 'context' else: typ = 'original' if m.group(n[2]): cert = 'low' else: cert = 'high' newtext = glyphs.encodeglyph(glyph,typ,cert) addnewtext(newtext) addglyphcount(newtext) def appdupundetected(self,m): sys.exit('undetected duplication mark in slip {}\n{}'.format(self.slipno,m.group(0))) #3.3.2.1. 
Pooling regex and functions in list Loop looplist = [ #native tags (re.compile(r'^<[^>]+>'),native_tags), #duplication mark expressions (re.compile(r'^((?:(?:{[^}]+}|[^()&﹦])&?(?:(?))?﹦&?(?:(?))?)+)(([^)]+))'),appduplication), #duplication mark expressions that stretch over two lines (re.compile(r'^({[^}]+}|[^()&﹦])(&)?((?))?(﹦)(&)?((?))?$'),appdupstretchover), #check for undetected duplication marks (re.compile(r'^(?:{[^}]+}|[^()&﹦])﹦'),appdupundetected), #the original punctuation mark '┘' (re.compile(r'^(┘)(&?)(?(??)([。,、?!:;]?))?'),apporipunct), #doubts on transcription ("字(?)"; "┘(?)" already processed previously) (re.compile(r'^({[^}]+}|.)(&?)(?)'),appdoubt), #redundant characters (re.compile(r'^[([^]]+)]'),appredund), #reinterpretation of characters("字(字)") (re.compile(r'^({[^}]+}|.)(&?)(([^)]+))'),appreinterpret), #errors("字〔字〕") (re.compile(r'^({[^}]+}|.)(&?)〔([^)]+)〕'),apperror), #unknown number of illegible characters (re.compile(r'^……'),appillegible), #omissions("〖字〗") (re.compile(r'^〖'),appomission1), (re.compile(r'^〗'),appomission2), #damage("【…】") (re.compile(r'^【'),appdamage1), (re.compile(r'^】'),appdamage2), #context based transcriptions(補釋=打框字) (re.compile(r'^({[^}]+}|.)&(?:(?))?'),appcontext), #modern punctuation marks (re.compile(r'^([。,、;:?!])'),apppunct), #single characters(non-unicode glyph="{A+B}") (re.compile(r'^(?:{[^}]+}|.)'),appsingle)] #3.3.2.2. Looping over regexs and functions for regex, function in looplist: m = re.search(regex,text) if m: text = re.sub(m.group(0),'',text,1) function(self,m) self.prevglyph = m.group(0) return text sys.exit('No matching glyph found in function chopfirstglyph for slipno {}\nleft slip text is {}'.format(self.slipno,text)) #3.3.3.Completing text line def completetextline(self): self.body += tags['pe'] #3.4. Closing subsection of before every new heading def completesubsection(self): self.body += tags['dive'] #3.5. 
Closing all subsections of and def completeencoding(self): self.body += tags['dive'] + tags['bodye'] for n in range(len(self.back)): self.back[n] += tags['dive'] self.back[-1] += tags['backe'] + tags['texte'] dicstring = re.sub('[{}\']','',re.sub(',','\n',str(glyphs.glyphnodic))) with open('glyphsnew.txt', 'w',encoding='utf-8-sig') as f: f.write(dicstring) #3.6. Auxiliary functions used by 3.3.2. chopfirstglyph() def setapparatus(self,typ = 'range'): self.appcount += 1 apparatus = tags['app' + typ] + tags['lemb'] self.appID = appIDformat.sub('A\\g<1>','000' + str(self.appcount)) apparatus = re.sub('appID',self.appID,apparatus) return apparatus def completeapparatus(self,apparatus): return apparatus + tags['leme'] + tags['appe'] def setsupplement(self,m,typ = 'dam'): if 'A' in self.pointerID: sys.exit('Supplement within supplement for {}{} on slip {}'.format(self.prevglyph,m.group(0),self.slipno)) apparatus = self.setapparatus('left') pointer = createpointer(self.pointerID,self.glyphcount) apparatus = re.sub('pointer1',pointer, apparatus) self.apparatus = apparatus + tags['supplied' + typ + 'b'] #remember glyphcount before resetting it for glyph counting in supplement self.slipglyphcount = self.glyphcount self.glyphcount = 1 #reset pointerID for reference from supplement element to interpretations on supplement element #pointerID before resetting is identical to slipno, no need for recording self.pointerID = self.appID def completesupplement(self,m): if 'J' in self.pointerID: sys.exit('Supplement element end without beginning for {}{} on slip {}'.format(self.prevglyph,m.group(0),self.slipno)) self.apparatus += tags['suppliede'] self.back[4] += self.completeapparatus(self.apparatus) self.apparatus = '' self.pointerID = self.slipno self.glyphcount = self.slipglyphcount #2. Creator module def createTEItexts(filename): """ Reads text from textfiles line by line, discerns subtitles from main text, and feeds everything into encoding mojule. 
Textfile format(for 嶽麓秦簡《爲獄等狀四種》): Line without numbering = Subsection headings Line with numbering at the beginning = text of 1 slip, number = slipnumber (If textfile format changes, this function as well as the class EncodedText needs to be changed) Parameters ---------- filename : str Returns ------- encodedText : list List """ #2.1. Initiating new instance of class EncodedText et = EncodedText() #2.2. Reading text from textfiles line by line datafile = open(filename, 'r', encoding='utf-8-sig') for line in datafile: line = re.sub('\n','',line) #eliminating empty lines if line == '':continue #2.3. Discerning subtitles and main text, and feeding them in. m = re.search('^([^\t]+)\t(.+)$',line) if m:#main text (text + tab + slipnumber) et.addmaintext(m.group(2),m.group(1))#既存のDivオブジェクトに一つの本文行を追加する。 else:#subtitel (no tabs) if et.prevslipno: et.completesubsection()#closes
of previous subsection et.setsubsection(line) et.completeencoding()#closes last
,,, and datafile.close() return et #4 Collating module #4.1. Comparison tools (based on newdiff.py by Wang Xun) Match = _namedtuple('Match', 'a b size') class SequenceMatcher: def __init__(self, a='', b=''): self.a = self.b = None self.set_seqs(a, b) def set_seqs(self, a, b): self.set_seq1(a) self.set_seq2(b) def set_seq1(self, a): if a is self.a: return self.a = a self.matching_blocks = self.opcodes = None def set_seq2(self, b): if b is self.b: return self.b = b self.matching_blocks = self.opcodes = None self.fullbcount = None self.__chain_b() def __chain_b(self): b = self.b self.b2j = b2j = {} for i, elt in enumerate(b): indices = b2j.setdefault(elt, []) indices.append(i) def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None): a, b, b2j = self.a, self.b, self.b2j if ahi is None: ahi = len(a) if bhi is None: bhi = len(b) besti, bestj, bestsize = alo, blo, 0 j2len = {} nothing = [] for i in range(alo, ahi): j2lenget = j2len.get newj2len = {} for j in b2j.get(a[i], nothing): # a[i] matches b[j] if j < blo: continue if j >= bhi: break k = newj2len[j] = j2lenget(j-1, 0) + 1 if k > bestsize: besti, bestj, bestsize = i-k+1, j-k+1, k j2len = newj2len return Match(besti, bestj, bestsize) def get_matching_blocks(self): if self.matching_blocks is not None: return self.matching_blocks la, lb = len(self.a), len(self.b) queue = [(0, la, 0, lb)] matching_blocks = [] while queue: alo, ahi, blo, bhi = queue.pop() i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi) if k: # if k is 0, there was no matching block matching_blocks.append(x) if alo < i and blo < j: queue.append((alo, i, blo, j)) if i+k < ahi and j+k < bhi: queue.append((i+k, ahi, j+k, bhi)) matching_blocks.sort() i1 = j1 = k1 = 0 non_adjacent = [] for i2, j2, k2 in matching_blocks: # Is this block adjacent to i1, j1, k1? 
if i1 + k1 == i2 and j1 + k1 == j2: # Yes, so collapse them -- this just increases the length of # the first block by the length of the second, and the first # block so lengthened remains the block to compare against. k1 += k2 else: # Not adjacent. Remember the first block (k1==0 means it's # the dummy we started with), and make the second block the # new block to compare against. if k1: non_adjacent.append((i1, j1, k1)) i1, j1, k1 = i2, j2, k2 if k1: non_adjacent.append((i1, j1, k1)) non_adjacent.append( (la, lb, 0) ) self.matching_blocks = list(map(Match._make, non_adjacent)) return self.matching_blocks def get_opcodes(self): if self.opcodes is not None: return self.opcodes i = j = 0 self.opcodes = answer = [] for ai, bj, size in self.get_matching_blocks(): tag = '' if i < ai and j < bj: tag = 'replace' elif i < ai: tag = 'delete' elif j < bj: tag = 'insert' if tag: answer.append( (tag, i, ai, j, bj) ) i, j = ai+size, bj+size if size: answer.append( ('equal', ai, i, bj, j) ) return answer def get_grouped_opcodes(self, n=3): codes = self.get_opcodes() if not codes: codes = [("equal", 0, 1, 0, 1)] if codes[0][0] == 'equal': tag, i1, i2, j1, j2 = codes[0] codes[0] = tag, max(i1, i2-n), i2, max(j1, j2-n), j2 if codes[-1][0] == 'equal': tag, i1, i2, j1, j2 = codes[-1] codes[-1] = tag, i1, min(i2, i1+n), j1, min(j2, j1+n) nn = n + n group = [] for tag, i1, i2, j1, j2 in codes: # End the current group and start a new one whenever # there is a large range with no changes. if tag == 'equal' and i2-i1 > nn: group.append((tag, i1, min(i2, i1+n), j1, min(j2, j1+n))) yield group group = [] i1, j1 = max(i1, i2-n), max(j1, j2-n) group.append((tag, i1, i2, j1 ,j2)) if group and not (len(group)==1 and group[0][0] == 'equal'): yield group #4.2. 
#4.2. Collating texts
def collateTEItexts(xml_Texts):
    """
    Collate the xml versions against the first one (the lemmatized version):
    compare slip-number sequences and per-slip glyph sequences and append a
    collation apparatus for every difference.

    Parameters
    ----------
    xml_Texts : list
        List of BeautifulSoup xml objects; xml_Texts[0] is taken as the
        lemmatized version.

    Returns
    -------
    lv : BeautifulSoup
        The lemmatized version with source attributes and collation apparatuses.
    """
    def idcountcheck(xmlobject, tag, idfirst):
        # find the highest xml:id of `tag`, strip the `idfirst` prefix and
        # return the next free number (used to continue the apparatus IDs)
        idcount = max(tag_element['xml:id'] for tag_element in xmlobject.find_all(tag))
        idcount = re.sub(idfirst, '', idcount)
        idcount = int(idcount) + 1
        return idcount

    def collationappcreator(app, pointers, appID, sourcecount):
        # fill pointer1, pointer2, ... with the supplied pointers
        for n in range(len(pointers)):
            pointer = 'pointer' + str(n + 1)
            app = re.sub(pointer, pointers[n], app)
        # re.sub is a no-op when the placeholder is absent
        app = re.sub('appID', appID, app)
        app += tags['rdgb']
        app = re.sub('sourceID', sourcecount, app)
        app += tags['rdge'] + tags['appe']
        app = BeautifulSoup(app, 'xml')
        return app

    def collation1rdgcreator(app, nvslipnolist, lvslipnolist):
        # readings on slip level: copy references for shared slips, whole
        # <p> elements for slips only present in the non-lemmatized version.
        # NOTE: `nv` is taken from the enclosing loop via closure.
        tag = app.rdg
        for slipno in nvslipnolist:
            if slipno in lvslipnolist:
                newtag = re.sub('copyID', slipno, tags['pcopy'])
                newtag = BeautifulSoup(newtag, 'xml')
                tag.append(newtag)
            else:
                newtag = nv.find('p', attrs={"xml:id": slipno})
                tag.append(newtag)
        return app

    def collation2rdgcreator(app, nvglyphs, i1, i2):
        # readings on glyph level: move the differing glyph elements into the reading
        tag = app.rdg
        for n in range(i1, i2):
            newtag = nvglyphs[n]
            tag.append(newtag)
        return app

    #4.2.1. Taking the first version as lemmatized version and adding source information
    lv = xml_Texts[0]
    for p_element in lv.find_all('p'):
        p_element['source'] = '01'
    #4.2.2 Comparing slip numbers of not lemmatized versions with lemmatized version
    lvslipnolist = attrlistup(lv, 'p', 'xml:id')
    # checking the highest apparatus ID and incrementing it by 1; will be used
    # when creating the collation apparatuses
    lvappcount = idcountcheck(lv, 'app', 'A')
    for n in range(1, len(xml_Texts)):
        nv = xml_Texts[n]
        sourcecount = '0' + str(n + 1)
        #4.2.2.1 Comparing slip numbers
        nvslipnolist = attrlistup(nv, 'p', 'xml:id')
        s = SequenceMatcher(nvslipnolist, lvslipnolist)
        # NOTE: the loop variable `tag` is deliberately reused for the found
        # element inside the branches; it is reassigned by the unpacking on
        # every iteration.
        for tag, i1, i2, j1, j2 in s.get_opcodes():
            if tag == 'equal':
                for m in range(j1, j2):
                    # keyword syntax 'xml:id'=... is impossible (colon), hence attrs=...
                    tag = lv.find('p', attrs={"xml:id": lvslipnolist[m]})
                    tag['source'] += ' ' + sourcecount
            elif tag == 'replace':
                app = tags['apprange']
                pointers = ['left(' + lvslipnolist[j1] + ')', 'right(' + lvslipnolist[j2 - 1] + ')']
                appID = 'A' + str(lvappcount)
                lvappcount += 1
                app = collationappcreator(app, pointers, appID, sourcecount)
                app = collation1rdgcreator(app, nvslipnolist[i1:i2], lvslipnolist)
                tag = lv.find('div', attrs={"type": "簡序"})
                tag.append(app)
                tag.append('\n')
            elif tag == 'insert':
                # slip numbers not included in nvslipnolist
                app = tags['apprange']
                pointers = ['left(' + lvslipnolist[j1] + ')', 'right(' + lvslipnolist[j2 - 1] + ')']
                appID = 'A' + str(lvappcount)
                lvappcount += 1
                app = collationappcreator(app, pointers, appID, sourcecount)
                app = collation1rdgcreator(app, nvslipnolist[i1:i2], lvslipnolist)
                tag = lv.find('div', attrs={"type": "簡序"})
                tag.append(app)
                tag.append('\n')
            elif tag == 'delete':
                app = tags['appleft']
                pointers = [lvslipnolist[j1]]
                appID = 'A' + str(lvappcount)
                lvappcount += 1
                app = collationappcreator(app, pointers, appID, sourcecount)
                app = collation1rdgcreator(app, nvslipnolist[i1:i2], lvslipnolist)
                tag = lv.find('div', attrs={"type": "簡序"})
                tag.append(app)
                tag.append('\n')
        #4.2.2.2 Comparing slip text
        for slipno in lvslipnolist:
            if slipno in nvslipnolist:
                lvptag, nvptag = lv.find('p', attrs={"xml:id": slipno}), nv.find('p', attrs={"xml:id": slipno})
                lvglyphlist, nvglyphlist = attrlistup(lvptag, 'g', 'ref'), attrlistup(nvptag, 'g', 'ref')
                nvglyphs = nvptag.find_all('g')
                s = SequenceMatcher(nvglyphlist, lvglyphlist)
                for tag, i1, i2, j1, j2 in s.get_opcodes():
                    if tag == 'equal':
                        pass
                    elif tag == 'replace':
                        app = tags['apprange']
                        pointers = [createpointer(slipno, j1 + 1), createpointer(slipno, j2)]
                        appID = 'A' + str(lvappcount)
                        lvappcount += 1
                        app = collationappcreator(app, pointers, appID, sourcecount)
                        app = collation2rdgcreator(app, nvglyphs, i1, i2)
                        tag = lv.find('div', attrs={"type": "正文比較"})
                        tag.append(app)
                        tag.append('\n')
                    elif tag == 'insert':
                        # glyphs not included in nvglyphlist
                        app = tags['apprange']
                        pointers = [createpointer(slipno, j1 + 1), createpointer(slipno, j2)]
                        appID = 'A' + str(lvappcount)
                        lvappcount += 1
                        app = collationappcreator(app, pointers, appID, sourcecount)
                        app = collation2rdgcreator(app, nvglyphs, i1, i2)
                        tag = lv.find('div', attrs={"type": "正文比較"})
                        tag.append(app)
                        tag.append('\n')
                    elif tag == 'delete':
                        if j1 < len(lvglyphlist):
                            app = tags['appleft']
                            pointers = [createpointer(slipno, j1 + 1)]
                        else:
                            # deletion at the very end of the slip: anchor to the right
                            app = tags['appright']
                            pointers = [createpointer(slipno, j1)]
                        appID = 'A' + str(lvappcount)
                        lvappcount += 1
                        app = collationappcreator(app, pointers, appID, sourcecount)
                        app = collation2rdgcreator(app, nvglyphs, i1, i2)
                        tag = lv.find('div', attrs={"type": "正文比較"})
                        tag.append(app)
                        tag.append('\n')
    return lv

#1. Main body (continue)
if __name__ == "__main__":
    #1.2. Reading input files, creating TEI encoded texts and converting them into XML files.
    encodedTexts = [] # list of encoded texts
    for n in range(1, 100):
        m = '0' + str(n)
        filename = 'input' + m[-2:] + '.txt' # input01.txt, input02.txt, ...
        if os.path.isfile(filename):
            encodedTexts.append(createTEItexts(filename))
        else:
            print('Read {} files'.format(str(n - 1)))
            if n == 1:
                sys.exit('input file not found')
            else:
                break
    xml_Texts = createxml(encodedTexts)
    #1.3. Outputting encoded texts for checking
    for n in range(len(encodedTexts)):
        outputtextbody = encodedTexts[n].body
        outputtextback = encodedTexts[n].back
        m = '0' + str(n + 1)
        filename = 'output' + m[-2:] + '.xml'
        outputfile = open(filename, 'w', encoding='utf-8-sig')
        outputfile.write(outputtextbody)
        for m in range(len(outputtextback)):
            outputfile.write(outputtextback[m])
        outputfile.close()
    #1.4. Collating encoded texts
    lv = collateTEItexts(xml_Texts)
    #1.5. Outputting results
    outputfile = open('output(collated).xml', 'w', encoding='utf-8-sig')
    with open("TeiHeader(part1).txt", 'r', encoding='utf-8-sig') as file:
        header = file.read()
    outputfile.write(header)
    glyphs.glyphoutput(outputfile)
    with open("TeiHeader(part2).txt", 'r', encoding='utf-8-sig') as file:
        header = file.read()
    outputfile.write(header)
    # drop the xml declaration BeautifulSoup prepends, then close the TEI element
    outputfile.write(re.sub('^<[^>]+>\n', '', str(lv)) + tags['TEIe'])
    outputfile.close()
    outputfile = open('output(nonunicode).txt', 'w', encoding='utf-8-sig')
    glyphs.nonunicodeoutput(outputfile)
    outputfile.close()