# -*- coding: utf-8 -*- """ Created on Mon Sep 16 20:15:26 2024 @author: Arnd Helmut Hafner """ #Adds the original punctuation marks("┘") to the transcribtion of "爲獄等状" #Manually corrected results can be found in Igokutojo(Zenjoho).txt import sys # for sys.exit after error import re from re import compile, search def testtitle(line): """ Tests title lines for read-functions Parameters ---------- line : str Returns ------- title : Boolean. """ title = False condition = compile("^([〇一二三四五六七八九]+ )|待考殘𥳑") if re.search(condition, line): title = True return(title) def testnumber(line): """ Reads slip number of text-file with original punctuation Parameters ---------- line : str Returns ------- slipn : str """ m = re.search('[^\t]+\t([^\t]+)', line) if m:return(m.group(1)) else:sys.exit("mismatch in testnaumber for {}".format(line)) def readori(filename): """ Reads the text-file with the original punctuation marks Parameters ---------- filename : str Returns ------- textori : dict {slipnumber: line} """ slipn = "" textori = {} datafile = open(filename, 'r', encoding='utf-8-sig') for line in datafile: line = re.sub('\n','',line) if line == '' or testtitle(line):continue if "\t" in line: slipn = testnumber(line) continue textori[slipn] = line datafile.close() return(textori) def insertpunc(newline,oldline,slipn,logfile): templine = newline ambig = False#置換に疑問が生じた時に置換を中止してログファイルに警告を出す pm = '。,、:;?!'#句読点 hit = re.findall('.┘.',oldline) r = 0 if re.search('┘.┘',oldline): logfile.write('Slip {} might have some misrepresentations because of vincinity of punctuation marks\n'.format(slipn)) while ambig == False and r < len(hit): A,B = hit[r].split('┘') reg = compile(A + '([^' + pm + A +']*?)([' +pm + ']*)' + B) if len(re.findall(reg,newline)) > 1: logfile.write('Slip {} might have some misrepresentations because of text similarities\n'.format(slipn)) m = re.search(reg,newline) if m: okikaemae = m.group(0) if m.group(1) and m.group(2): okikaego = A + m.group(1) + '┘(' + m.group(2) + ')' + B elif m.group(2): okikaego = A + '┘(' + m.group(2) + ')' + B elif m.group(1): okikaego = A + m.group(1) + '┘' + B else: okikaego = A + '┘' + B newline = newline.replace(okikaemae, okikaego,1) r += 1 else: ambig = True if ambig : logfile.write("""Replacement for slip {} failed.\nOldline is: {}\nNewline is: {}\nReplacement result is: {}\n""".format( slipn,oldline,templine,newline)) return(templine) return(newline) def substpunct(slipn, line, textori,logfile): """ Comparing text-lines and inserting original punctuation marks Parameters ---------- slipn : str line : str textori : dict {slipnumber: line(ori)} logfile : IOWrapper Returns ------- newline : str 'slipn + \t + line' """ if slipn not in textori.keys(): logfile.write('slipnumber not found in textori:{}\n'.format(slipn)) elif '┘' in textori[slipn]: line = insertpunc(line,textori[slipn],slipn,logfile) return('{}\t{}'.format(slipn,line)) if __name__ == "__main__": #reading text-file with original punctuation textori = readori("Igokutojo(Hakubun).txt") #reading text-file with new punctuation and adding original punctuation datafile = open('Igokutojo(Zenjoho).txt', 'r', encoding='utf-8-sig') outputfile = open('output.txt','w', encoding='utf-8-sig') logfile = open('log.txt','w', encoding='utf-8-sig') for line in datafile: line = re.sub('\n','',line) #checking and outputing title-lines and empty lines if line == '' or testtitle(line): outputfile.write('{}\n'.format(line)) continue #discerning slipnumbers from text and substituting punctuation marks m = re.search('^([^\t]+)\t([^\t]+)$',line) if m: outputfile.write('{}\n'.format(substpunct(m.group(1),m.group(2),textori,logfile))) else: sys.exit("mismatch in discerning slipnumbers for {}".format(line)) datafile.close() outputfile.close() logfile.close()