#...of previous subsection
et.setsubsection(line)
et.completeencoding()#closes the last subsection
datafile.close()
return et
#4 Collating module
#4.1. Comparison tools (based on newdiff.py by Wang Xun; the SequenceMatcher below follows Python's difflib.SequenceMatcher, without junk handling)
Match = _namedtuple('Match', 'a b size')
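#Match.a and Match.b are the start indices of a matching block in sequences a and b, and Match.size is
#its length; this assumes an earlier import such as: from collections import namedtuple as _namedtuple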
class SequenceMatcher:
def __init__(self, a='', b=''):
self.a = self.b = None
self.set_seqs(a, b)
def set_seqs(self, a, b):
self.set_seq1(a)
self.set_seq2(b)
def set_seq1(self, a):
if a is self.a:
return
self.a = a
self.matching_blocks = self.opcodes = None
def set_seq2(self, b):
if b is self.b:
return
self.b = b
self.matching_blocks = self.opcodes = None
self.fullbcount = None
self.__chain_b()
def __chain_b(self):
b = self.b
self.b2j = b2j = {}
for i, elt in enumerate(b):
indices = b2j.setdefault(elt, [])
indices.append(i)
def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None):
a, b, b2j = self.a, self.b, self.b2j
if ahi is None:
ahi = len(a)
if bhi is None:
bhi = len(b)
besti, bestj, bestsize = alo, blo, 0
j2len = {}
nothing = []
for i in range(alo, ahi):
j2lenget = j2len.get
newj2len = {}
for j in b2j.get(a[i], nothing):
# a[i] matches b[j]
if j < blo:
continue
if j >= bhi:
break
k = newj2len[j] = j2lenget(j-1, 0) + 1
if k > bestsize:
besti, bestj, bestsize = i-k+1, j-k+1, k
j2len = newj2len
return Match(besti, bestj, bestsize)
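#Illustrative example (not executed): SequenceMatcher('abxcd', 'abcd').find_longest_match()
#returns Match(a=0, b=0, size=2), i.e. the block 'ab' starting at index 0 in both sequences;
#ties are resolved in favour of the earliest block because only strictly longer matches replace the best one.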
def get_matching_blocks(self):
if self.matching_blocks is not None:
return self.matching_blocks
la, lb = len(self.a), len(self.b)
queue = [(0, la, 0, lb)]
matching_blocks = []
while queue:
alo, ahi, blo, bhi = queue.pop()
i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)
if k: # if k is 0, there was no matching block
matching_blocks.append(x)
if alo < i and blo < j:
queue.append((alo, i, blo, j))
if i+k < ahi and j+k < bhi:
queue.append((i+k, ahi, j+k, bhi))
matching_blocks.sort()
i1 = j1 = k1 = 0
non_adjacent = []
for i2, j2, k2 in matching_blocks:
# Is this block adjacent to i1, j1, k1?
if i1 + k1 == i2 and j1 + k1 == j2:
# Yes, so collapse them -- this just increases the length of
# the first block by the length of the second, and the first
# block so lengthened remains the block to compare against.
k1 += k2
else:
# Not adjacent. Remember the first block (k1==0 means it's
# the dummy we started with), and make the second block the
# new block to compare against.
if k1:
non_adjacent.append((i1, j1, k1))
i1, j1, k1 = i2, j2, k2
if k1:
non_adjacent.append((i1, j1, k1))
non_adjacent.append( (la, lb, 0) )
self.matching_blocks = list(map(Match._make, non_adjacent))
return self.matching_blocks
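#Illustrative example (not executed): SequenceMatcher('abxcd', 'abcd').get_matching_blocks()
#returns [Match(a=0, b=0, size=2), Match(a=3, b=2, size=2), Match(a=5, b=4, size=0)];
#the final zero-length Match is a sentinel marking the ends of both sequences.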
def get_opcodes(self):
if self.opcodes is not None:
return self.opcodes
i = j = 0
self.opcodes = answer = []
for ai, bj, size in self.get_matching_blocks():
tag = ''
if i < ai and j < bj:
tag = 'replace'
elif i < ai:
tag = 'delete'
elif j < bj:
tag = 'insert'
if tag:
answer.append( (tag, i, ai, j, bj) )
i, j = ai+size, bj+size
if size:
answer.append( ('equal', ai, i, bj, j) )
return answer
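#Illustrative example (not executed): SequenceMatcher('qabxcd', 'abycdf').get_opcodes() returns
#[('delete', 0, 1, 0, 0), ('equal', 1, 3, 0, 2), ('replace', 3, 4, 2, 3),
# ('equal', 4, 6, 3, 5), ('insert', 6, 6, 5, 6)];
#each (tag, i1, i2, j1, j2) describes how a[i1:i2] maps onto b[j1:j2].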
def get_grouped_opcodes(self, n=3):
codes = self.get_opcodes()
if not codes:
codes = [("equal", 0, 1, 0, 1)]
if codes[0][0] == 'equal':
tag, i1, i2, j1, j2 = codes[0]
codes[0] = tag, max(i1, i2-n), i2, max(j1, j2-n), j2
if codes[-1][0] == 'equal':
tag, i1, i2, j1, j2 = codes[-1]
codes[-1] = tag, i1, min(i2, i1+n), j1, min(j2, j1+n)
nn = n + n
group = []
for tag, i1, i2, j1, j2 in codes:
# End the current group and start a new one whenever
# there is a large range with no changes.
if tag == 'equal' and i2-i1 > nn:
group.append((tag, i1, min(i2, i1+n), j1, min(j2, j1+n)))
yield group
group = []
i1, j1 = max(i1, i2-n), max(j1, j2-n)
group.append((tag, i1, i2, j1, j2))
if group and not (len(group)==1 and group[0][0] == 'equal'):
yield group
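#In this script the sequences compared are lists of slip numbers or of glyph references, e.g.:
#   s = SequenceMatcher(nvslipnolist, lvslipnolist)
#   for tag, i1, i2, j1, j2 in s.get_opcodes():
#       ...#'equal' records shared material; 'replace'/'insert'/'delete' produce apparatus entries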
#4.2. Collating texts
def collateTEItexts(xml_Texts):
def idcountcheck(xmlobject,tag,idfirst):
idcount = max(tag_element['xml:id'] for tag_element in xmlobject.find_all(tag))
idcount = re.sub(idfirst,'',idcount)
idcount = int(idcount) + 1
return idcount
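#idcountcheck returns the next free numeric suffix for IDs of the form idfirst + number (e.g. 'A1').
#Note two assumptions in the implementation above: at least one element with the given tag already
#exists (otherwise max() raises ValueError), and the ID strings sort correctly as strings
#(e.g. fewer than ten of them, or zero-padded numbers).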
def collationappcreator(app,pointers,appID,sourcecount):
for n in range(len(pointers)):
pointer = 'pointer' + str(n+1)
app = re.sub(pointer,pointers[n],app)
app = re.sub('appID',appID,app)
app += tags['rdgb']
app = re.sub('sourceID',sourcecount,app)
app += tags['rdge'] + tags['appe']
app = BeautifulSoup(app,'xml')
return app
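#collationappcreator fills an apparatus template from tags[]: the placeholder strings 'pointer1',
#'pointer2', ..., 'appID' and 'sourceID' are replaced by the actual pointers, apparatus ID and
#witness number, an empty <rdg> is opened and closed, and the result is parsed by BeautifulSoup
#into an XML fragment ready to receive readings.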
def collation1rdgcreator(app,nvslipnolist,lvslipnolist):
tag = app.rdg
for slipno in nvslipnolist:
if slipno in lvslipnolist:
newtag = re.sub('copyID',slipno,tags['pcopy'])
newtag = BeautifulSoup(newtag,'xml')
tag.append(newtag)
else:
newtag = nv.find('p', attrs={"xml:id": slipno})
tag.append(newtag)
return app
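#collation1rdgcreator fills the <rdg> of a slip-order apparatus: slips that also occur in the
#lemmatized version are referenced through the 'pcopy' template (with copyID replaced by the slip
#number), while slips found only in the non-lemmatized version nv are copied in as full <p> elements.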
def collation2rdgcreator(app,nvglyphs,i1,i2):
tag = app.rdg
for n in range(i1,i2):
newtag = nvglyphs[n]
tag.append(newtag)
return app
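#collation2rdgcreator fills the <rdg> of a text apparatus by appending the glyph (<g>) elements
#nvglyphs[i1:i2] from the non-lemmatized version.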
#4.2.1. Taking first version as lemmatized version and adding source information
lv = xml_Texts[0]
for p_element in lv.find_all('p'):
p_element['source'] = '01'
#4.2.2 Comparing slip numbers of not lemmatized versions with lemmatized version
lvslipnolist = attrlistup(lv,'p','xml:id')
lvappcount = idcountcheck(lv,'app','A')#Finds the highest existing apparatus ID and increments it by 1; used below when creating new collation apparatus entries.
for n in range(1,len(xml_Texts)):
nv = xml_Texts[n]
sourcecount = '0' + str(n+1)
#4.2.2.1 Comparing slip numbers
nvslipnolist = attrlistup(nv,'p','xml:id')
s = SequenceMatcher(nvslipnolist,lvslipnolist)
for tag, i1, i2, j1, j2 in s.get_opcodes():
if tag == 'equal':
for m in range(j1, j2):
tag = lv.find('p', attrs={"xml:id": lvslipnolist[m]})
#Note: lv.find('p', 'xml:id'=lvslipnolist[m]) is a syntax error because 'xml:id' is not a valid Python keyword argument name, hence the attrs dictionary.
tag['source'] += ' ' + sourcecount
elif tag == 'replace':
app = tags['apprange']
pointers = ['left(' + lvslipnolist[j1] + ')','right(' + lvslipnolist[j2-1] + ')']
appID = 'A' + str(lvappcount)
lvappcount += 1
app = collationappcreator(app,pointers,appID,sourcecount)
app = collation1rdgcreator(app,nvslipnolist[i1:i2],lvslipnolist)
tag = lv.find('div', attrs= {"type":"簡序"})
tag.append(app)
tag.append('\n')
elif tag == 'insert':
#slip numbers present in lvslipnolist but not in nvslipnolist
app = tags['apprange']
pointers = ['left(' + lvslipnolist[j1] + ')','right(' + lvslipnolist[j2-1] + ')']
appID = 'A' + str(lvappcount)
lvappcount += 1
app = collationappcreator(app,pointers,appID,sourcecount)
app = collation1rdgcreator(app,nvslipnolist[i1:i2],lvslipnolist)
tag = lv.find('div', attrs= {"type":"簡序"})
tag.append(app)
tag.append('\n')
elif tag == 'delete':
app = tags['appleft']
pointers = [lvslipnolist[j1]]
appID = 'A' + str(lvappcount)
lvappcount += 1
app = collationappcreator(app,pointers,appID,sourcecount)
app = collation1rdgcreator(app,nvslipnolist[i1:i2],lvslipnolist)
tag = lv.find('div', attrs= {"type":"簡序"})
tag.append(app)
tag.append('\n')
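#Summary of the slip-order comparison above: 'equal' only adds this witness to the @source of the
#shared <p> elements; 'replace' and 'insert' create a range apparatus anchored on lvslipnolist[j1]
#to lvslipnolist[j2-1]; 'delete' uses the single-pointer 'appleft' template with lvslipnolist[j1]
#(which assumes the deletion does not fall at the very end of the lemmatized list, unlike the
#glyph-level 'delete' handling in 4.2.2.2 below).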
#4.2.2.2 Comparing slip text
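#For every slip shared by both versions the glyph sequences are compared with a second
#SequenceMatcher; createpointer(slipno, position) is assumed to build a pointer to the glyph at the
#given 1-based position within the slip, which is why the opcode indices appear below as j1+1 and j2.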
for slipno in lvslipnolist:
if slipno in nvslipnolist:
lvptag, nvptag = lv.find('p', attrs={"xml:id":slipno}), nv.find('p', attrs={"xml:id":slipno})
lvglyphlist, nvglyphlist = attrlistup(lvptag,'g','ref'), attrlistup(nvptag,'g','ref')
nvglyphs = nvptag.find_all('g')
s = SequenceMatcher(nvglyphlist,lvglyphlist)
for tag, i1, i2, j1, j2 in s.get_opcodes():
if tag == 'equal':
pass
elif tag == 'replace':
app = tags['apprange']
pointers = [createpointer(slipno,j1+1),createpointer(slipno,j2)]
appID = 'A' + str(lvappcount)
lvappcount += 1
app = collationappcreator(app,pointers,appID,sourcecount)
app = collation2rdgcreator(app,nvglyphs,i1,i2)
tag = lv.find('div', attrs= {"type":"正文比較"})
tag.append(app)
tag.append('\n')
elif tag == 'insert':#glyphs present in lvglyphlist but not in nvglyphlist
app = tags['apprange']
pointers = [createpointer(slipno,j1+1),createpointer(slipno,j2)]
appID = 'A' + str(lvappcount)
lvappcount += 1
app = collationappcreator(app,pointers,appID,sourcecount)
app = collation2rdgcreator(app,nvglyphs,i1,i2)
tag = lv.find('div', attrs= {"type":"正文比較"})
tag.append(app)
tag.append('\n')
elif tag == 'delete':
if j1 < len(lvglyphlist):
app = tags['appleft']
pointers = [createpointer(slipno,j1+1)]
else:
app = tags['appright']
pointers = [createpointer(slipno,j1)]
appID = 'A' + str(lvappcount)
lvappcount += 1
app = collationappcreator(app,pointers,appID,sourcecount)
app = collation2rdgcreator(app,nvglyphs,i1,i2)
tag = lv.find('div', attrs= {"type":"正文比較"})
tag.append(app)
tag.append('\n')
return lv
#1. Main body (continue)
if __name__ == "__main__":
#1.2. Reading input files, creating TEI encoded texts and converting them into XML files.
encodedTexts = []#List of encoded texts
for n in range(1,100):
m = '0' + str(n)
filename = 'input' + m[-2:] + '.txt'#zero-padded two-digit file names: input01.txt, input02.txt, ..., input99.txt
if os.path.isfile(filename):
encodedTexts.append(createTEItexts(filename))
else:
print('Read {} files'.format(n-1))
if n == 1:
sys.exit('input file not found')
else:
break
xml_Texts = createxml(encodedTexts)
#1.3. Outputting encoded texts
for n in range(len(encodedTexts)):
outputtextbody = encodedTexts[n].body
outputtextback = encodedTexts[n].back
m = '0' + str(n+1)
filename = 'output' + m[-2:] + '.xml'
outputfile = open(filename, 'w', encoding='utf-8-sig')
outputfile.write(outputtextbody)
for m in range(len(outputtextback)):
outputfile.write(outputtextback[m])
outputfile.close()
#1.4. Collating encoded texts
lv = collateTEItexts(xml_Texts)
#1.5. Outputting results
outputfile = open('output(collated).xml', 'w', encoding='utf-8-sig')
with open("TeiHeader(part1).txt", 'r', encoding='utf-8-sig') as file:
header = file.read()
outputfile.write(header)
glyphs.glyphoutput(outputfile)
with open("TeiHeader(part2).txt", 'r', encoding='utf-8-sig') as file:
header = file.read()
outputfile.write(header)
outputfile.write(re.sub('^<[^>]+>\n','',str(lv)) + tags['TEIe'])#removes the XML declaration that BeautifulSoup prepends, then appends the closing TEI tag
outputfile.close()
outputfile = open('output(nonunicode).txt', 'w', encoding='utf-8-sig')
glyphs.nonunicodeoutput(outputfile)
outputfile.close()
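#Illustrative run: place input01.txt, input02.txt, ... together with TeiHeader(part1).txt and
#TeiHeader(part2).txt in the working directory and execute this script; it writes one outputNN.xml
#per input file plus output(collated).xml and output(nonunicode).txt.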