# -*- coding: utf-8 -*- """ Created on Thu Mar 27 10:37:23 2025 @author: Arnd Hemlut Hafner """ ############# """ This script encodes different versions of transcriptions of the same text in order to collate them with each other. Main processing flow: Creating TEI encoded xml tree from every textfile version of text. Outputting encoded xml text in textfile format for checking. Comparing xml element sequences, creating TEI encoded xml file entailing one lemmatized version and annotations concerning variations. Output in different html formats. Structure of script: 1. Main body 1.1. Initiationg tag and glyph dictionaries 1.2. Reading input files, creating TEI encoded texts and converting them into XML files. 1.3. Outputting encoded texts for checking purpose. 1.4. Collating encoded texts and creating one lemmatized version of encoded text (not yet written) 2. Creator module "createTEItexts" 2.1. Initiating new instance of class EncodedText 2.2. Reading text from textfiles line by line, 2.3. Discerning subtitles from main text, and feeding them into encoding class EncodedText. Calling the following functions of class EncodedText during processing: 3.2. setsubsection() 3.3. addmaintext() 3.4. completesubsection() 3.5. completeencoding() 2.4. Converting encoded texts into xml-objects 3. Encoding class EncodedText 3.1. Creating empty and elements in textform, and initiating several control variables(__init__) 3.2. Setting up a new subsection in for every heading discovered in the text by creator module (setsubsection) 3.3. Processing main text glyph by glyph and adding them to element (addmaintext), setting up and closing text line elements in before and after processing. Calling the following class functions during processing: 3.3.1. settextline() 3.3.2. chopfirstglyph() 3.3.2.1. Pooling regex and functions in list Loop 3.3.2.2. Looping over regex, chopping glyph/glyph-units, and delegating creation of apparatuses to functions. 
In case of regex matching, the loop is exited after creation of apparatus. Order of application of regexs matters! 3.3.3. completetextline() 3.4. Closing subsection of before every new heading and before ending entire encoding process (completesubsection) 3.5. Closing all subsections of and as well as the two elements themselves, and outputting the glyph dictionary generated during processing (completeencoding) 3.6. Several auxiliary functions used by 3.3.2. chopfirstglyph() 4.Collating module (not yet created) Functions jointly used by class EncodedText """ import sys # for sys.exit after error import re from re import compile, search import os from bs4 import BeautifulSoup #for creation of xml element from tags import Tags #dictionary of tag names from collections import namedtuple as _namedtuple #used by newdiff from pprint import pprint #used by newdiff #1. Main body #1.1. Initiationg tag and glyph dictionaries tags = Tags().tags class Glyph(): """ 主として漢字もglyphとしてエンコーディングするための辞書。 """ glyphcount = -1 def __init__(self): #glyphnodic== {glyph:glyphno} with open('glyphs.txt', 'r',encoding='utf-8-sig') as file: self.glyphnodic = {key.strip(): value.strip() for key, value in (line.split(':', 1) for line in file)} #glyphdic== {glyphno:glyph} with open('glyphs.txt', 'r',encoding='utf-8-sig') as file: self.glyphdic = {value.strip(): key.strip() for key, value in (line.split(':', 1) for line in file)} self.nonunicode = {} def lookupglyphnumber(self,glyph): """ 字単位のエンコーディングを行う際に、辞書への収録を確認し、必要に応じて 辞書登録を行いつつ、glyphコードを返す。 原文句読記号や現代の句読点を除いては、0から始まる通し番号を附してエンコーディングを行う。 Parameters ---------- glyph : str 漢字などの文字。一字のみ Returns ------- str glyphコード() """ if glyph in self.glyphnodic.keys() : pass else: type(self).glyphcount += 1 glyphno = str(type(self).glyphcount) self.glyphnodic[glyph] = glyphno self.glyphdic[glyphno] = glyph if '{' in glyph: name = '00000' + str(len(self.nonunicode)+1) name = name[-6:] self.nonunicode[glyph] = name return self.glyphnodic[glyph] def 
encodeglyph(self,glyph,typ='original',cert='high'): """ Converts single glyphs into glyph elements. Glyph element default format: Indication of doubts ('(?)'): "high" is replaced with "low" Indication of context based readings (補釋=打框字): "original" is replaced with "context" Parameters ---------- glyph : str glyph to be converted typ : str, optional Indication of transcription type. The default is 'original'. cert : str, optional Indication of confidence in transcription. The default is 'high'. Returns ------- glyph : str element in text format. Is to be converted into xml element finally. """ #format check for glyph if len(re.findall('(?:{[^}]+}|.)',glyph)) > 1: sys.exit('Error in encodeglyph for {}'.format(glyph)) #create default glyph = re.sub('glyphnumber',self.lookupglyphnumber(glyph),tags['glyph']) #adjust attributes if typ !='original': glyph = re.sub('original',typ,glyph) if cert !='high': glyph = re.sub('high',cert,glyph) return glyph def decodeglyph(self,glyph): """ Returns glyphs into ordinary glyphs Adds '&' or '?' for context-based or uncertain readings(大框字或存疑) Parameters ---------- glyph : str DESCRIPTION. Returns ------- glyph : str 字 or 字& or 字? or 字&? """ m1 = re.search('ref="#([^"]+)"',glyph) m2 = re.search('type="([^"]+)"',glyph) m3 = re.search('cert="([^"]+)"',glyph) glyphno,typ,cert = m1.group(1),m2.group(1),m3.group(1) glyph = self.glyphdic[glyphno] if typ == 'context':glyph +='&' if glyph == 'low':glyph += '?' return glyph def encodeglyphstring(self,string): if '&' in string: sys.exit('Precaution! Glyph & in string to be encoded! String is {}'.format(string)) elif '﹦' in string: sys.exit('Precaution! Glyph ﹦ in string to be encoded! 
String is {}'.format(string)) glyph = re.findall('(?:{[^}]+}|.)??',string) for g in glyph: if not re.search('[<>/a-zA-Z\\d\\./"_ #\\(\\)=]',g): if re.search('.?', g): string = re.sub(g,self.encodeglyph(re.sub('?$','',g),'original','low'),string,1) else: string = re.sub(g,self.encodeglyph(g),string,1) return string def glyphoutput(self,outputfile): """ glyphとしてエンコーディングされた文字の辞書を出力する Parameters ---------- dict : dic エンコーディングにおいて生成された文字辞書(目下は原文句読記号のみ) outputfile : I/O Wrapper Returns ------- None. """ for key, val in self.glyphnodic.items(): outputfile.write('\n'.format(val)) if '{' in key: outputfile.write('\t{}\n'.format(key)) outputfile.write('\t\n'.format(self.nonunicode[key])) else: outputfile.write('\t{}\n'.format(key)) outputfile.write('\n') def nonunicodeoutput(self,outputfile): for key, val in self.nonunicode.items(): outputfile.write('{}\t{}\n'.format(val,key)) glyphs = Glyph() appIDformat = compile('0*(\\d{4,})') #tools for encoding process def createpointer(pointerID,glyphcount): pointer = tags['pointerglyph'] return re.sub('pointerID',pointerID,re.sub('字數',str(glyphcount),pointer)) def searchpreparation(string,sufflate,prefix = '',suffix = ''): ''' Inserts regex into strings in order to suppress interference of irregular symbols during search Parameters ---------- string : str searched string sufflate : str Regex inserted between the constituent glyphs of the string prefix : str, optional Regex inserted before the string. The default is ''. suffix : str, optional Regex inserted behind the string. The default is ''. 
Returns ------- string: str ''' #記号交じりの原文から文字列を検索するために、文字列を正規表現に置き換える #string = 検索したい文字列 #prefex = 原文では検索したい文字列の前にあり得る記号のリストの正規表現 #sufflate = 検索したい文字列の間に出現しうる記号のリストの正規表現 #suffix = 検索したい文字列の後ろに出現しうる記号のリストの正規表現 string = re.findall('.',string) tempstring = prefix + string[0] for c in range(1,len(string)): tempstring += sufflate + string[c] string = tempstring + suffix return(string) def hyphenencoding1(string,tagtrunk1,tagtrunk2 ='reg'): """ 字1=字2=(字1-字3,字2)における"字1-字3"の部分にタグを付ける Parameters ---------- string : str タグ付け対象文字列 tagtrunk1 : str 字1-字3における字1を記述するタグの根幹部分 tagtrunk : str 字1-字3における字3を記述するタグの根幹部分 Returns ------- string = str """ replace = compile('((?:{[^}]+}|.)??)\\-((?:{[^}]+}|.)??)') return replace.sub('{}{}\\g<1>{}{}\\g<2>{}{}'.format(tags['choiceb'],tags[tagtrunk1+'b'],tags[tagtrunk1+'e'],tags[tagtrunk2+'b'],tags[tagtrunk2+'e'],tags['choicee']),string) def punctuationencoding(string): punctuationmarks = re.findall('([。,、?!:;])', string) for mark in punctuationmarks: glyph = glyphs.encodeglyph(mark) string = re.sub(mark,glyph,string,1) return string #tools for xml convertion and xml object transformations def createxml(encodedTexts): """ Transforming encoded texts into xml-objects Parameters ---------- encodedTexts : list List of encoded texts Returns ------- xml_Texts : list List of xml convertions of encoded texts """ xml_Texts = [] for encodedText in encodedTexts: xml_Text = BeautifulSoup(encodedText.body + ''.join(encodedText.back),'xml') xml_Texts.append(xml_Text) return xml_Texts def attrlistup(xmlobject,tag,attr): attrlist = [] for tag_element in xmlobject.find_all(tag): attrlist.append(tag_element[attr]) return attrlist #Other tools #3 Class for encoding class EncodedText: """ First, one ecoded text for every transcription of the text is created ("single version"), second, after collation of different transcriptions, one comprehensive encoded text entailing all differences between different transcription soucres is created ("lemmatized version"). 
Structure of TEI encoded text: 1)Single version(for details see Sample(TEIencoding,(single version).xml)): ...
subtitle

plain sliptext

...
subtitle

plain sliptext

...
.../lem>
.../lem>
.../lem>
......
2)Lemmatized version In add one
to annotate differences in primary transcription. In every add , add @source also to . Xml:id for every source is documented in within of """ #3.1.Creating empty and elements in textform, #and initiating control variables(__init__) def __init__(self): self.body = tags['textb'] + tags['bodyb'] self.back = [tags['backb']+tags['divcoll1'],tags['divcoll2'],tags['divdupl'],tags['divrepl'],tags['divsuppliedelem'],tags['divsuppliedinterp']] self.prevglyph = ''#necessary to check the usage of the original punctuation mark '┘'. self.dupstretchover = ()#for storage of duplication mark expressions that stretch over two slips. self.prevslipno = '' self.apparatus = '' self.appcount = 0 self.appID = '' self.glyphcount = 1 #3.2. Setting up a new subsection in def setsubsection(self,subtitle): self.body += tags['divsection'] + tags['headb'] + subtitle + tags['heade'] #3.3. Processing main text glyph by glyph def addmaintext(self,text,slipno): #Resetting pointer ID used to link apparatus with main text lines slipno = 'J' + slipno okikae = compile('\\((\\d+)\\)') #slipno will be used as xml:id, parantheses etc. 
not allowed→converting (2) and (3) into J2J and J3J if re.search(okikae,slipno): slipno = okikae.sub('S\\g<1>E',slipno) self.pointerID = self.slipno = slipno #Setting up new text line element in , while checking for stretchovers from previous text line self.settextline(slipno) #if stretchovers should pushed into following line: #if not self.dupstretchover:self.settextline(slipno) #Resetting glyph count used to link apparatus with particular glyph elements in the main text line self.slipglyphcount = self.glyphcount = 1 #Processing text glyph by glyph while text: text = self.chopfirstglyph(text) #Closing text line element of , while checking for stretchovers to following text line self.completetextline() #if stretchovers should pushed into following line: #if not self.dupstretchover:self.completetextline() #Resetting control variables self.prevslipno = self.slipno self.prevglyph = '' #3.3.1. def settextline(self,slipno): self.body += re.sub('簡號',slipno,tags['pb']) #3.3.2. def chopfirstglyph(self,text): """ Chop the first glyph or glyph unit from the text line of a slip, returning the left glyphs. Annotations of glyphs embedded in the sliptext are taken together with the annotated glyphs as glyph units. That means "字(字)" and "字﹦字﹦(字字,字字)" etc. are handled and counted as one glyph unit. Chopped glyphs or glyph units are inputted into apparatus creating functions. 
Main Flow: Pooling regex and functions in list Loop Looping over loop, chopping glyph/glyph-units, and delegating creation of apparatus to functions (self.prevglyph is used as storage for preceding glyphs, only used for apparatus creation of original punctuation mark '┘') Parameters ---------- text : str sliptext Returns ------- text : str """ def addnewtext(text): if 'J' in self.pointerID: self.body += text elif 'A' in self.pointerID: self.apparatus += text else: sys.exit('Error in addnewtext for glyph {} on slip no {}'.format(text,self.slipno)) def addnewapp(apparatus,n): #n = number of list except for apparatuses of supplements if 'J' in self.pointerID: self.back[n] += apparatus elif 'A' in self.pointerID: self.back[5] += apparatus else: sys.exit('Error in addnewapp for glyph {} on slip no {}'.format(text,self.slipno)) def addglyphcount(newtext,glyphcount = 0): if not glyphcount: glyphcount = len(re.findall('<[gs]',newtext,)) self.glyphcount += glyphcount def appsingle(self,m): if '&' in m.group(0) : sys.exit('Undetected & in slipno {}'.format(self.slipno)) elif '﹦' in m.group(0) : sys.exit('Undetected ﹦ in slipno {}'.format(self.slipno)) newtext = glyphs.encodeglyph(m.group(0)) addnewtext(newtext) addglyphcount(newtext) def native_tags(self,m): if re.search('^', m.group(3)),tagtrunk1,tagtrunk2) else: apparatus += tags[tagtrunk1 + 'b'] + mainglyph + tags[tagtrunk1 + 'e'] apparatus += tags[tagtrunk2 + 'b'] + glyphs.encodeglyphstring(regex.sub('\\g<1>',m.group(3))) + tags[tagtrunk2 + 'e'] addnewapp(self.completeapparatus(apparatus),3) return def apperror(self,m): replacement(m,'sic','corr') def appreinterpret(self,m): replacement(m,'orig','reg') def appredund(self,m): newtext = glyphs.encodeglyph(m.group(1)) addnewtext(newtext) apparatus = self.setapparatus() + tags['surplusred'] + newtext + tags['surpluse'] pointer1 = createpointer(self.pointerID,self.glyphcount) addglyphcount(newtext) pointer2 = createpointer(self.pointerID,self.glyphcount-1) apparatus = 
re.sub('pointer1',pointer1,re.sub('pointer2',pointer2, apparatus)) addnewapp(self.completeapparatus(apparatus),3) def appdoubt(self,m): if m.group(2):#"字&(?)" #m.group(2) = "&" #m.group(1) = 字 newtext = glyphs.encodeglyph(m.group(1),'context','low') else: newtext = glyphs.encodeglyph(m.group(1),'original','low') addnewtext(newtext) addglyphcount(newtext) def apporipunct(self,m): """ Handles the original puntuation mark '┘' Parameters ---------- m : re.Match regex == r'^(┘)(&?)(?(??)([。,、?!:;]?))?' Returns ------- None (Adds original punctuation mark to text and increases glyphcount). """ oripunct = m.group(1) if m.group(4): #group(4) =="([。,、?!:;]?)" before ")"→"┘(?。)" or "┘(,)" etc. n = re.search('^({[^}]+}|.)(&?)(([^)]+))',m.group(0)) appreinterpret(self,n)#Handles supplied modern punctuation like other replacements return#preventing counting self.glyphcount twice. elif m.group(3): #group(3) == "(??)" after "("→ "┘(?)" if not self.prevglyph or self.prevglyph in '。,、?!:;':#Cannot be a punctuation mark n = re.search('^({[^}]+}|.)(&?)(?)',m.group(0)) appdoubt(self,n) else:#Most probably a punctuation mark, but also possibility of doubt n = re.search('^({[^}]+}|.)(&?)(([^)]+))',m.group(0)) appreinterpret(self,n) print('Precaution! Ambivalence in slip {} for {}. {} was tentatively interpreted as a replacement expression. Check whether "?" was not intended to express doubt about the reading!'.format(self.slipno,m.group(0),m.group(0))) return#preventing counting self.glyphcount twice. elif m.group(2): newtext = glyphs.encodeglyph(oripunct,'context') else: newtext = glyphs.encodeglyph(oripunct) addnewtext(newtext) addglyphcount(newtext) def appduplication(self,m): slipno = self.slipno pointerID = self.pointerID #abbr = "字﹦", "字﹦字﹦"etc., expan ="(字字)","(字字,字字)" etc. 
abbr,expan = m.group(1),m.group(2) elementlist = re.findall('(?:{[^}]+}|[^()&﹦])[(?)&]*﹦[(?)&]*',abbr) def appinit(): apparatus = self.setapparatus() + tags['choiceb'] + tags['abbrb'] #Searching for fragments of duplication expression at the end of previous slip prevabbr = '' prevslipno = self.prevslipno if self.dupstretchover: if prevslipno != self.dupstretchover[0]: sys.exit('duplication stretching over more than 2 slips\ncurrent slip {}\nstretching from {}\nprevslipno is {}'.format(slipno,self.dupstretchover[0],prevslipno)) prevslipno,prevglyphcount,prevabbr = self.dupstretchover self.dupstretchover = '' pointer1 = createpointer(prevslipno,prevglyphcount) else: pointer1 = createpointer(pointerID,self.glyphcount) addglyphcount(''.join(elementlist),len(elementlist)*2) pointer2 = createpointer(pointerID,self.glyphcount-1) return prevabbr, re.sub('pointer1',pointer1,re.sub('pointer2',pointer2, apparatus)) def abbrencoding(abbr): abbrenc= abbr #handling annotations on context based transcriptions(補釋=打框字) and doubts firstglyph = re.findall('^(?:{[^}]+}|.)',abbr)[0] #annotations on plain text found = False for regex1,regex2,typ,cert in [('','&(?)','context','low'), ('','&','context','high'), ('','(?)','original','low'), ('^','﹦','original','high')]: regex = regex1 + firstglyph + regex2 if re.search(re.compile(regex),abbr): firstglyphencode = glyphs.encodeglyph(firstglyph,typ,cert) if regex1:regex = firstglyph abbrenc = re.sub(regex,firstglyphencode,abbrenc,1) abbrtmp = re.sub(regex,firstglyphencode,abbr,1) found = True break if not found: sys.exit('unknown element in duplication mark matching(1st if)! 
Element is {},firstglyph is {}'.format(element,firstglyph)) #annotations on duplication marks firstglyph = firstglyphencode found = False for regex,typ,cert in [('﹦&(?)','context','low'), ('﹦&','context','high'), ('﹦(?)','original','low'), ('﹦$','original','high')]: if re.search(firstglyph + regex,abbrtmp): abbrenc = re.sub(regex,'﹦',abbrenc,1) abbrenc = re.sub(firstglyph + '﹦',firstglyph + tags['amb'] + glyphs.encodeglyph('﹦',typ,cert) + tags['ame'],abbrenc,1) found = True break if not found: sys.exit('unknown element in duplication mark matching(2nd if)! Element is {},firstglyph is {}'.format(element,firstglyph)) return abbrenc def expanbreakdown(abbr,expan,abbrlen): """discerning two parts of expansion(expan): expanmain == the string directly corresponding to original glyphs ex == the string created based on an interpretation of the duplication marks The main problem to solve is the random occurence of modern punctuation marks within the expansion. """ if abbrlen > 1 and '-' not in expan:#'肆﹦室﹦(肆、室。肆、室)'等に対応する abbr2 = searchpreparation(abbr,'[。,、?!:;]?') elif '-' in expan:#'气﹦鞫﹦(气-乞鞫,气-乞鞫)'等 abbr2 = searchpreparation(abbr,'(?:\\-.)?[。,、?!:;]?','','(?:\\-.)?') else:abbr2 = abbr regex = compile('(' + abbr2 + '[。,、?!:;]?)') m1 = re.search(regex,expan) if m1: expanmain = m1.group(1) ex = re.sub(regex,'',expan,1) else:sys.exit('Mismatch in appabbr for {}\nbango is {}\n'. format(expan,slipno)) return expanmain, ex def expanencoding(expanmain,ex): #handling "字-字" constellations if '-' in expan: #encoding "字-字" constellations expanmain,ex = hyphenencoding1(expanmain,'orig'),hyphenencoding1(ex,'ex') else: ex = tags['exb'] + ex + tags['exe'] #Encoding punctuation marks in expanmain and ex. 
expanmain = punctuationencoding(expanmain) ex = punctuationencoding(ex) return glyphs.encodeglyphstring(expanmain), glyphs.encodeglyphstring(ex) #initializing apparatus element and handling stretchovers prevabbr, apparatus = appinit() #encoding abbreviation part of dulplication mark expression #dividing"字=字="into"字=" elements, encode transcription annotations("&","(?)"etc), #and pushing sliptext to body if prevabbr: prevabbrenc = abbrencoding(prevabbr) + tags['lb'] """ The following script lines apply to the case that stretchovers should be completely pushed to following line. newtext = re.sub('','',prevabbrenc) addnewtext(newtext) self.completetextline() self.settextline(slipno) """ apparatus += prevabbrenc abbrenc = '' for element in elementlist: abbrenc += abbrencoding(element) newtext = re.sub('','',abbrenc) addnewtext(newtext) apparatus += abbrenc + tags['abbre'] + tags['expanb'] #encoding expansion part of duplication mark expression abbr = prevabbr + abbr abbrlen = len(re.findall('﹦',abbr)) abbr = re.sub('[&﹦(?)]','',abbr) expanmain, ex = expanbreakdown(abbr,expan,abbrlen) #encoding of expanmain and ex expanmainenc, exenc = expanencoding(expanmain,ex) #putting expanmain and ex together again if abbr[0] == expan[0]: apparatus += expanmainenc + exenc else:#'夫=(大夫)' etc. apparatus += exenc + expanmainenc #finishing apparatus and pushing to apparatus += tags['expane'] + tags['choicee'] addnewapp(self.completeapparatus(apparatus),2) def appdupstretchover(self,m): self.dupstretchover = (self.slipno,self.glyphcount,m.group(0)) for n in ((1,2,3),(4,5,6)): glyph = m.group(n[0]) if m.group(n[1]): typ = 'context' else: typ = 'original' if m.group(n[2]): cert = 'low' else: cert = 'high' newtext = glyphs.encodeglyph(glyph,typ,cert) addnewtext(newtext) addglyphcount(newtext) def appdupundetected(self,m): sys.exit('undetected duplication mark in slip {}\n{}'.format(self.slipno,m.group(0))) #3.3.2.1. 
Pooling regex and functions in list Loop looplist = [ #native tags (re.compile(r'^<[^>]+>'),native_tags), #duplication mark expressions (re.compile(r'^((?:(?:{[^}]+}|[^()&﹦])&?(?:(?))?﹦&?(?:(?))?)+)(([^)]+))'),appduplication), #duplication mark expressions that stretch over two lines (re.compile(r'^({[^}]+}|[^()&﹦])(&)?((?))?(﹦)(&)?((?))?$'),appdupstretchover), #check for undetected duplication marks (re.compile(r'^(?:{[^}]+}|[^()&﹦])﹦'),appdupundetected), #the original punctuation mark '┘' (re.compile(r'^(┘)(&?)(?(??)([。,、?!:;]?))?'),apporipunct), #doubts on transcription ("字(?)"; "┘(?)" already processed previously) (re.compile(r'^({[^}]+}|.)(&?)(?)'),appdoubt), #redundant characters (re.compile(r'^[([^]]+)]'),appredund), #reinterpretation of characters("字(字)") (re.compile(r'^({[^}]+}|.)(&?)(([^)]+))'),appreinterpret), #errors("字〔字〕") (re.compile(r'^({[^}]+}|.)(&?)〔([^)]+)〕'),apperror), #unknown number of illegible characters (re.compile(r'^……'),appillegible), #omissions("〖字〗") (re.compile(r'^〖'),appomission1), (re.compile(r'^〗'),appomission2), #damage("【…】") (re.compile(r'^【'),appdamage1), (re.compile(r'^】'),appdamage2), #context based transcriptions(補釋=打框字) (re.compile(r'^({[^}]+}|.)&(?:(?))?'),appcontext), #modern punctuation marks (re.compile(r'^([。,、;:?!])'),apppunct), #single characters(non-unicode glyph="{A+B}") (re.compile(r'^(?:{[^}]+}|.)'),appsingle)] #3.3.2.2. Looping over regexs and functions for regex, function in looplist: m = re.search(regex,text) if m: text = re.sub(m.group(0),'',text,1) function(self,m) self.prevglyph = m.group(0) return text sys.exit('No matching glyph found in function chopfirstglyph for slipno {}\nleft slip text is {}'.format(self.slipno,text)) #3.3.3.Completing text line def completetextline(self): self.body += tags['pe'] #3.4. Closing subsection of before every new heading def completesubsection(self): self.body += tags['dive'] #3.5. 
Closing all subsections of and def completeencoding(self): self.body += tags['dive'] + tags['bodye'] for n in range(len(self.back)): self.back[n] += tags['dive'] self.back[-1] += tags['backe'] + tags['texte'] dicstring = re.sub('[{}\']','',re.sub(',','\n',str(glyphs.glyphnodic))) with open('glyphsnew.txt', 'w',encoding='utf-8-sig') as f: f.write(dicstring) #3.6. Auxiliary functions used by 3.3.2. chopfirstglyph() def setapparatus(self,typ = 'range'): self.appcount += 1 apparatus = tags['app' + typ] + tags['lemb'] self.appID = appIDformat.sub('A\\g<1>','000' + str(self.appcount)) apparatus = re.sub('appID',self.appID,apparatus) return apparatus def completeapparatus(self,apparatus): return apparatus + tags['leme'] + tags['appe'] def setsupplement(self,m,typ = 'dam'): if 'A' in self.pointerID: sys.exit('Supplement within supplement for {}{} on slip {}'.format(self.prevglyph,m.group(0),self.slipno)) apparatus = self.setapparatus('left') pointer = createpointer(self.pointerID,self.glyphcount) apparatus = re.sub('pointer1',pointer, apparatus) self.apparatus = apparatus + tags['supplied' + typ + 'b'] #remember glyphcount before resetting it for glyph counting in supplement self.slipglyphcount = self.glyphcount self.glyphcount = 1 #reset pointerID for reference from supplement element to interpretations on supplement element #pointerID before resetting is identical to slipno, no need for recording self.pointerID = self.appID def completesupplement(self,m): if 'J' in self.pointerID: sys.exit('Supplement element end without beginning for {}{} on slip {}'.format(self.prevglyph,m.group(0),self.slipno)) self.apparatus += tags['suppliede'] self.back[4] += self.completeapparatus(self.apparatus) self.apparatus = '' self.pointerID = self.slipno self.glyphcount = self.slipglyphcount #2. Creator module def createTEItexts(filename): """ Reads text from textfiles line by line, discerns subtitles from main text, and feeds everything into encoding mojule. 
Textfile format(for 嶽麓秦簡《爲獄等狀四種》): Line without numbering = Subsection headings Line with numbering at the beginning = text of 1 slip, number = slipnumber (If textfile format changes, this function as well as the class EncodedText needs to be changed) Parameters ---------- filename : str Returns ------- encodedText : list List """ #2.1. Initiating new instance of class EncodedText et = EncodedText() #2.2. Reading text from textfiles line by line datafile = open(filename, 'r', encoding='utf-8-sig') for line in datafile: line = re.sub('\n','',line) #eliminating empty lines if line == '':continue #2.3. Discerning subtitles and main text, and feeding them in. m = re.search('^([^\t]+)\t(.+)$',line) if m:#main text (text + tab + slipnumber) et.addmaintext(m.group(2),m.group(1))#既存のDivオブジェクトに一つの本文行を追加する。 else:#subtitel (no tabs) if et.prevslipno: et.completesubsection()#closes
of previous subsection et.setsubsection(line) et.completeencoding()#closes last
,,, and datafile.close() return et #4 Collating module #4.1. Comparison tools (based on newdiff.py by Wang Xun) Match = _namedtuple('Match', 'a b size') class SequenceMatcher: def __init__(self, a='', b=''): self.a = self.b = None self.set_seqs(a, b) def set_seqs(self, a, b): self.set_seq1(a) self.set_seq2(b) def set_seq1(self, a): if a is self.a: return self.a = a self.matching_blocks = self.opcodes = None def set_seq2(self, b): if b is self.b: return self.b = b self.matching_blocks = self.opcodes = None self.fullbcount = None self.__chain_b() def __chain_b(self): b = self.b self.b2j = b2j = {} for i, elt in enumerate(b): indices = b2j.setdefault(elt, []) indices.append(i) def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None): a, b, b2j = self.a, self.b, self.b2j if ahi is None: ahi = len(a) if bhi is None: bhi = len(b) besti, bestj, bestsize = alo, blo, 0 j2len = {} nothing = [] for i in range(alo, ahi): j2lenget = j2len.get newj2len = {} for j in b2j.get(a[i], nothing): # a[i] matches b[j] if j < blo: continue if j >= bhi: break k = newj2len[j] = j2lenget(j-1, 0) + 1 if k > bestsize: besti, bestj, bestsize = i-k+1, j-k+1, k j2len = newj2len return Match(besti, bestj, bestsize) def get_matching_blocks(self): if self.matching_blocks is not None: return self.matching_blocks la, lb = len(self.a), len(self.b) queue = [(0, la, 0, lb)] matching_blocks = [] while queue: alo, ahi, blo, bhi = queue.pop() i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi) if k: # if k is 0, there was no matching block matching_blocks.append(x) if alo < i and blo < j: queue.append((alo, i, blo, j)) if i+k < ahi and j+k < bhi: queue.append((i+k, ahi, j+k, bhi)) matching_blocks.sort() i1 = j1 = k1 = 0 non_adjacent = [] for i2, j2, k2 in matching_blocks: # Is this block adjacent to i1, j1, k1? 
if i1 + k1 == i2 and j1 + k1 == j2: # Yes, so collapse them -- this just increases the length of # the first block by the length of the second, and the first # block so lengthened remains the block to compare against. k1 += k2 else: # Not adjacent. Remember the first block (k1==0 means it's # the dummy we started with), and make the second block the # new block to compare against. if k1: non_adjacent.append((i1, j1, k1)) i1, j1, k1 = i2, j2, k2 if k1: non_adjacent.append((i1, j1, k1)) non_adjacent.append( (la, lb, 0) ) self.matching_blocks = list(map(Match._make, non_adjacent)) return self.matching_blocks def get_opcodes(self): if self.opcodes is not None: return self.opcodes i = j = 0 self.opcodes = answer = [] for ai, bj, size in self.get_matching_blocks(): tag = '' if i < ai and j < bj: tag = 'replace' elif i < ai: tag = 'delete' elif j < bj: tag = 'insert' if tag: answer.append( (tag, i, ai, j, bj) ) i, j = ai+size, bj+size if size: answer.append( ('equal', ai, i, bj, j) ) return answer def get_grouped_opcodes(self, n=3): codes = self.get_opcodes() if not codes: codes = [("equal", 0, 1, 0, 1)] if codes[0][0] == 'equal': tag, i1, i2, j1, j2 = codes[0] codes[0] = tag, max(i1, i2-n), i2, max(j1, j2-n), j2 if codes[-1][0] == 'equal': tag, i1, i2, j1, j2 = codes[-1] codes[-1] = tag, i1, min(i2, i1+n), j1, min(j2, j1+n) nn = n + n group = [] for tag, i1, i2, j1, j2 in codes: # End the current group and start a new one whenever # there is a large range with no changes. if tag == 'equal' and i2-i1 > nn: group.append((tag, i1, min(i2, i1+n), j1, min(j2, j1+n))) yield group group = [] i1, j1 = max(i1, i2-n), max(j1, j2-n) group.append((tag, i1, i2, j1 ,j2)) if group and not (len(group)==1 and group[0][0] == 'equal'): yield group #4.2. 
#4.2. Collating texts
def collateTEItexts(xml_Texts):
    """
    Collate the xml versions against the first one (the lemmatized version):
    compare slip-number sequences and per-slip glyph sequences and append a
    collation apparatus for every difference.

    Parameters
    ----------
    xml_Texts : list
        List of BeautifulSoup xml objects; xml_Texts[0] is taken as the
        lemmatized version.

    Returns
    -------
    lv : BeautifulSoup
        The lemmatized version with source attributes and collation apparatuses.
    """
    def idcountcheck(xmlobject, tag, idfirst):
        # find the highest xml:id of `tag`, strip the `idfirst` prefix and
        # return the next free number (used to continue the apparatus IDs)
        idcount = max(tag_element['xml:id'] for tag_element in xmlobject.find_all(tag))
        idcount = re.sub(idfirst, '', idcount)
        idcount = int(idcount) + 1
        return idcount

    def collationappcreator(app, pointers, appID, sourcecount):
        # fill pointer1, pointer2, ... with the supplied pointers
        for n in range(len(pointers)):
            pointer = 'pointer' + str(n + 1)
            app = re.sub(pointer, pointers[n], app)
        # re.sub is a no-op when the placeholder is absent
        app = re.sub('appID', appID, app)
        app += tags['rdgb']
        app = re.sub('sourceID', sourcecount, app)
        app += tags['rdge'] + tags['appe']
        app = BeautifulSoup(app, 'xml')
        return app

    def collation1rdgcreator(app, nvslipnolist, lvslipnolist):
        # readings on slip level: copy references for shared slips, whole
        # <p> elements for slips only present in the non-lemmatized version.
        # NOTE: `nv` is taken from the enclosing loop via closure.
        tag = app.rdg
        for slipno in nvslipnolist:
            if slipno in lvslipnolist:
                newtag = re.sub('copyID', slipno, tags['pcopy'])
                newtag = BeautifulSoup(newtag, 'xml')
                tag.append(newtag)
            else:
                newtag = nv.find('p', attrs={"xml:id": slipno})
                tag.append(newtag)
        return app

    def collation2rdgcreator(app, nvglyphs, i1, i2):
        # readings on glyph level: move the differing glyph elements into the reading
        tag = app.rdg
        for n in range(i1, i2):
            newtag = nvglyphs[n]
            tag.append(newtag)
        return app

    #4.2.1. Taking the first version as lemmatized version and adding source information
    lv = xml_Texts[0]
    for p_element in lv.find_all('p'):
        p_element['source'] = '01'
    #4.2.2 Comparing slip numbers of not lemmatized versions with lemmatized version
    lvslipnolist = attrlistup(lv, 'p', 'xml:id')
    # checking the highest apparatus ID and incrementing it by 1; will be used
    # when creating the collation apparatuses
    lvappcount = idcountcheck(lv, 'app', 'A')
    for n in range(1, len(xml_Texts)):
        nv = xml_Texts[n]
        sourcecount = '0' + str(n + 1)
        #4.2.2.1 Comparing slip numbers
        nvslipnolist = attrlistup(nv, 'p', 'xml:id')
        s = SequenceMatcher(nvslipnolist, lvslipnolist)
        # NOTE: the loop variable `tag` is deliberately reused for the found
        # element inside the branches; it is reassigned by the unpacking on
        # every iteration.
        for tag, i1, i2, j1, j2 in s.get_opcodes():
            if tag == 'equal':
                for m in range(j1, j2):
                    # keyword syntax 'xml:id'=... is impossible (colon), hence attrs=...
                    tag = lv.find('p', attrs={"xml:id": lvslipnolist[m]})
                    tag['source'] += ' ' + sourcecount
            elif tag == 'replace':
                app = tags['apprange']
                pointers = ['left(' + lvslipnolist[j1] + ')', 'right(' + lvslipnolist[j2 - 1] + ')']
                appID = 'A' + str(lvappcount)
                lvappcount += 1
                app = collationappcreator(app, pointers, appID, sourcecount)
                app = collation1rdgcreator(app, nvslipnolist[i1:i2], lvslipnolist)
                tag = lv.find('div', attrs={"type": "簡序"})
                tag.append(app)
                tag.append('\n')
            elif tag == 'insert':
                # slip numbers not included in nvslipnolist
                app = tags['apprange']
                pointers = ['left(' + lvslipnolist[j1] + ')', 'right(' + lvslipnolist[j2 - 1] + ')']
                appID = 'A' + str(lvappcount)
                lvappcount += 1
                app = collationappcreator(app, pointers, appID, sourcecount)
                app = collation1rdgcreator(app, nvslipnolist[i1:i2], lvslipnolist)
                tag = lv.find('div', attrs={"type": "簡序"})
                tag.append(app)
                tag.append('\n')
            elif tag == 'delete':
                app = tags['appleft']
                pointers = [lvslipnolist[j1]]
                appID = 'A' + str(lvappcount)
                lvappcount += 1
                app = collationappcreator(app, pointers, appID, sourcecount)
                app = collation1rdgcreator(app, nvslipnolist[i1:i2], lvslipnolist)
                tag = lv.find('div', attrs={"type": "簡序"})
                tag.append(app)
                tag.append('\n')
        #4.2.2.2 Comparing slip text
        for slipno in lvslipnolist:
            if slipno in nvslipnolist:
                lvptag, nvptag = lv.find('p', attrs={"xml:id": slipno}), nv.find('p', attrs={"xml:id": slipno})
                lvglyphlist, nvglyphlist = attrlistup(lvptag, 'g', 'ref'), attrlistup(nvptag, 'g', 'ref')
                nvglyphs = nvptag.find_all('g')
                s = SequenceMatcher(nvglyphlist, lvglyphlist)
                for tag, i1, i2, j1, j2 in s.get_opcodes():
                    if tag == 'equal':
                        pass
                    elif tag == 'replace':
                        app = tags['apprange']
                        pointers = [createpointer(slipno, j1 + 1), createpointer(slipno, j2)]
                        appID = 'A' + str(lvappcount)
                        lvappcount += 1
                        app = collationappcreator(app, pointers, appID, sourcecount)
                        app = collation2rdgcreator(app, nvglyphs, i1, i2)
                        tag = lv.find('div', attrs={"type": "正文比較"})
                        tag.append(app)
                        tag.append('\n')
                    elif tag == 'insert':
                        # glyphs not included in nvglyphlist
                        app = tags['apprange']
                        pointers = [createpointer(slipno, j1 + 1), createpointer(slipno, j2)]
                        appID = 'A' + str(lvappcount)
                        lvappcount += 1
                        app = collationappcreator(app, pointers, appID, sourcecount)
                        app = collation2rdgcreator(app, nvglyphs, i1, i2)
                        tag = lv.find('div', attrs={"type": "正文比較"})
                        tag.append(app)
                        tag.append('\n')
                    elif tag == 'delete':
                        if j1 < len(lvglyphlist):
                            app = tags['appleft']
                            pointers = [createpointer(slipno, j1 + 1)]
                        else:
                            # deletion at the very end of the slip: anchor to the right
                            app = tags['appright']
                            pointers = [createpointer(slipno, j1)]
                        appID = 'A' + str(lvappcount)
                        lvappcount += 1
                        app = collationappcreator(app, pointers, appID, sourcecount)
                        app = collation2rdgcreator(app, nvglyphs, i1, i2)
                        tag = lv.find('div', attrs={"type": "正文比較"})
                        tag.append(app)
                        tag.append('\n')
    return lv

#1. Main body (continue)
if __name__ == "__main__":
    #1.2. Reading input files, creating TEI encoded texts and converting them into XML files.
    encodedTexts = [] # list of encoded texts
    for n in range(1, 100):
        m = '0' + str(n)
        filename = 'input' + m[-2:] + '.txt' # input01.txt, input02.txt, ...
        if os.path.isfile(filename):
            encodedTexts.append(createTEItexts(filename))
        else:
            print('Read {} files'.format(str(n - 1)))
            if n == 1:
                sys.exit('input file not found')
            else:
                break
    xml_Texts = createxml(encodedTexts)
    #1.3. Outputting encoded texts for checking
    for n in range(len(encodedTexts)):
        outputtextbody = encodedTexts[n].body
        outputtextback = encodedTexts[n].back
        m = '0' + str(n + 1)
        filename = 'output' + m[-2:] + '.xml'
        outputfile = open(filename, 'w', encoding='utf-8-sig')
        outputfile.write(outputtextbody)
        for m in range(len(outputtextback)):
            outputfile.write(outputtextback[m])
        outputfile.close()
    #1.4. Collating encoded texts
    lv = collateTEItexts(xml_Texts)
    #1.5. Outputting results
    outputfile = open('output(collated).xml', 'w', encoding='utf-8-sig')
    with open("TeiHeader(part1).txt", 'r', encoding='utf-8-sig') as file:
        header = file.read()
    outputfile.write(header)
    glyphs.glyphoutput(outputfile)
    with open("TeiHeader(part2).txt", 'r', encoding='utf-8-sig') as file:
        header = file.read()
    outputfile.write(header)
    # drop the xml declaration BeautifulSoup prepends, then close the TEI element
    outputfile.write(re.sub('^<[^>]+>\n', '', str(lv)) + tags['TEIe'])
    outputfile.close()
    outputfile = open('output(nonunicode).txt', 'w', encoding='utf-8-sig')
    glyphs.nonunicodeoutput(outputfile)
    outputfile.close()