# -*- coding: utf-8 -*-
"""
Created on Fri Apr 26 10:07:13 2024

@author: Arnd Helmut Hafner

Input routines for indexdata02.py: functions that read the index data files.
"""

import sys # for sys.exit after error
import re
from re import compile, search
import tools

#General input-related functions
def nondataline(conditions, line):
    '''Tell whether a line carries no substantial index data.

    Used to skip headlines, page number lines and similar lines.

    Parameters
    ----------
    conditions : list
        Regular expressions that hit non-data lines. They differ from
        index to index.
    line : str
        One data line.

    Returns
    -------
    str
        'true' (truthy) as soon as one of the expressions hits the line,
        otherwise '' (falsy).

    '''
    # any() stops at the first hit, just like an early return from a loop
    hit = any(re.search(pattern, line) for pattern in conditions)
    return 'true' if hit else ''


def headwordcheck(condition,line):
    '''Return the headword contained in a line, if there is one.

    Parameters
    ----------
    condition : str
        Regular expression that recognizes headword lines; its first
        capture group is the headword. It is only reliable after unrelated
        lines have been excluded with nondataline.
    line : str
        Index data line.

    Returns
    -------
    str
        Content of the first capture group, or '' when the expression
        does not hit the line.

    '''
    found = re.search(condition, line)
    return found.group(1) if found else ''

def slipnotecheck(string):
    '''Separate a '*' or '#' remark from a slip number or string.

    '*' takes precedence: if the string contains '*', every '*' is removed
    and a '#' possibly present is left untouched. Only when there is no '*'
    is '#' looked for.

    Parameters
    ----------
    string : str
        Slip number or string that may carry a '*' or '#' as a remark.

    Returns
    -------
    string : str
        The input with the detected remark character removed.
    slipnote : str
        '*', '#', or '' when no remark was found.

    '''
    # Plain str.replace instead of a regex: the former pattern '\*' was an
    # invalid escape sequence in a non-raw literal (SyntaxWarning on 3.12+).
    for marker in ('*', '#'):
        if marker in string:
            return (string.replace(marker, ''), marker)
    return (string, '')

#Input related functions for personal name index
# Hits "字（字）" and "｛字＋字｝（字）" (optionally with '？' before the closing
# bracket); group 2 is the character inside the round brackets.
_PERSNAME_BRACKET = re.compile('(｛[^｝]+｝|.)（(.)？?）')


def persnamestandard(headword):
    '''Create the headword and its standardized form.

    Three cases are distinguished:

    * The headword contains '（': every "X（Y）" / "｛...｝（Y）" sequence is
      replaced by Y to obtain the standardized form; the headword itself
      is returned unchanged.
    * Otherwise, if it contains '【': the standardized form keeps the
      headword as given, and in the returned headword '【……】'
      (standardized expression for broken slips) is replaced by '〼'
      (expression for broken slips in the original text).
    * Otherwise both values are the headword as given.

    Parameters
    ----------
    headword : str
        Personal name as it appears in the index.

    Returns
    -------
    headword : str
        Headword in original writing.
    headst : str
        Standardized form of the headword.

    '''
    if '（' in headword:
        # raw string: '\g<2>' in a normal literal is an invalid escape
        headst = _PERSNAME_BRACKET.sub(r'\g<2>', headword)
    elif '【' in headword:
        headst = headword
        headword = headword.replace('【……】', '〼')
    else:
        headst = headword
    return (headword, headst)


def persnameitems(line):
    '''Analyse a data line of the personal name index.

    Data lines in the personal name index have two different patterns.
    Pattern1:
        Headword,space,slipnumber,space,notes1,space,notes2,space,notes3
    Pattern2:
        Slipnumber,space,notes1,space,notes2,space,notes3

    Pattern1 is tried first; it requires the line to start with a token
    that contains no space, no digit and no 'J'. If it does not hit, the
    line is analysed as Pattern2.

    Parameters
    ----------
    line : str
        Data line.

    Returns
    -------
    m : match object or None
        In both cases m.group(1) = slipnumber and m.group(2~4) = notes
        (empty strings when absent). None if neither pattern hits.

    '''
    # Raw strings: '\d' inside a normal literal is an invalid escape
    # sequence (SyntaxWarning on Python 3.12+).
    m = re.search(r'^[^ \dJ]+ ([^ ]+) *([^ ]*) *([^ ]*) *([^ ]*)$', line)
    if m:
        return m
    return re.search(r'([^ ]+) *([^ ]*) *([^ ]*) *([^ ]*)$', line)
    
    
def persnameinput(lemmata,lemmatano,items,ID ='persname',filename = 'indexdata(personal names).txt'):
    '''Reads data of personal name index

    Parameters
    ----------
    lemmata : list
        [(headword,headst,ID)]
        (persname index doesn't discern lemmata and strings!)
        New entries are appended in place.
    lemmatano : int
        counting the number of lemmata, will be stored in items
    items : list
        [(stringori,stringreg,lemmatano,slipnumber,slipnote,notes)]
        New entries are appended in place.
    ID : str
        sort of index
        The default is 'persname'.
    filename : str
        Name of file containing personal name index data
        The default is 'indexdata(personal names).txt'.

    Returns
    -------
    lemmata : list
        list of headwords
    lemmatano: int
        number of accumulated lemmata
    items : list
        list of index items

    Raises
    ------
    SystemExit
        If a data line matches none of the expected patterns, or if an
        item line appears before any headword has been read.

    '''
    # Strings of the current headword; None until the first headword is seen
    stringori = stringreg = None

    # 'with' guarantees the file is closed even when sys.exit is raised
    with open(filename, 'r', encoding='utf-8-sig') as datafile:
        for line in datafile:
            line = line.replace('\n', '')
            #Excluding non-data-related lines
            #'★' doesn't exist in the original data. It is used to manually
            #eliminate individual records from being processed.
            if nondataline([' →', '（続き', r'^[\.<]', '★'], line):
                continue
            #Extracting headwords
            persname = headwordcheck(r'^([^ \dJ]+)', line)
            if persname:
                lemmatano += 1
                #Creates strings out of headword
                stringori, stringreg = persnamestandard(persname)
                #Adding data to lemmata
                #NOTE(review): both slots receive the standardized form; the
                #original writing is only stored with the items. Confirm intended.
                lemmata.append((stringreg, stringreg, ID))
                #Line without any digit holds no other data: next line
                if re.search(r'^[^\d]+$', line):
                    continue
            #Extracting items
            m = persnameitems(line)
            if m is None:
                sys.exit('mismatch in persname:' + line)
            if stringori is None:
                #Formerly an UnboundLocalError; report the offending line instead
                sys.exit('mismatch in persname (item before first headword):' + line)
            #Checks whether slip numbers have a note in form of '*' or '#' attached
            slipnumber, slipnote = slipnotecheck(m.group(1))
            notes = m.group(2) + m.group(3) + m.group(4)
            items.append((stringori, stringreg, lemmatano, slipnumber, slipnote, notes))
    return lemmata,lemmatano,items
        

#Input-related functions for official name and other indices
def officialaoitems(line):
    '''Split a data line of the official name and other indices.

    Structure of data lines:
        stringreg,tab,slipnumber,tab,stringori
        (If stringreg = stringori, stringori is omitted)
    Lines whose first field contains '→' or '←' (reference items) are
    not hit.

    Parameters
    ----------
    line : str
        Index data line.

    Returns
    -------
    tuple
        (stringreg,slipnumber,stringori); stringori falls back to
        stringreg when it is omitted. ('','','') if the line does not
        have the expected structure.

    '''
    parsed = re.search('^([^\t→←]+)\t([^\t]+)\t*([^\t]*)$', line)
    if parsed is None:
        return ('', '', '')
    stringreg, slipnumber, stringori = parsed.groups()
    # an omitted third field means the original writing equals stringreg
    return (stringreg, slipnumber, stringori or stringreg)

def officialaorefitems(line):
    '''Extract a reference item of the index while checking the line format.

    Apart from the arrow at the head of the line, the format equals that
    of ordinary items. So only the presence of the arrow is checked here;
    the arrows are then removed and the actual extraction is delegated to
    officialaoitems.

    Parameters
    ----------
    line : str
        Index data line.

    Returns
    -------
    tuple
        Result of officialaoitems for the line without arrows, or
        ('','','') if the line does not start with '←' or '→'.

    '''
    if not re.search('^[←→]', line):
        return ('', '', '')
    without_arrows = re.sub('[←→]', '', line)
    return officialaoitems(without_arrows)
    
    
def officialandotherinput(lemmata,lemmatano,items,refitems,ID,filename):
    '''Reads data of the three reference items entailing indices:
    official name index, rank index, labor related term index

    Parameters
    ----------
    lemmata : list
        [(headword,headst,ID)]
        New entries are appended in place.
    lemmatano : int
        counting the number of lemmata, will be stored in items
    items : list
        [(stringori,stringreg,lemmatano,slipnumber,slipnote,notes)]
        New entries are appended in place.
    refitems: list
        [(stringori,stringreg,lemmatano,slipnumber,slipnote,notes)]
        New entries are appended in place.
    ID : str
        sort of index, stored with every lemma
    filename : str
        Name of file containing the index data (no default)

    Returns
    -------
    lemmata : list
        list of headwords
    lemmatano: int
        number of accumulated lemmata
    items : list
        list of index items
    refitems : list
        list of reference items

    Raises
    ------
    SystemExit
        If a line is neither a headline, an item nor a reference item.

    '''

    notes = ''#These indices don't entail notes

    # 'with' guarantees the file is closed even when sys.exit is raised
    with open(filename, 'r', encoding='utf-8-sig') as datafile:
        for line in datafile:
            line = line.replace('\n', '')
            #Excluding non-data-related lines
            if nondataline([' →', r'^[\.<]', '★'], line):
                continue

            #Extracting headwords
            #In these three indices, headwords appear only in separate
            #headlines without slip numbers.
            headword = headwordcheck(r'^([^\d]+)$', line)
            if headword:
                lemmatano += 1
                #headwords entail explanatory information in round brackets like 安陽（県）
                headst = re.sub('（.+$', '', headword)
                #Adding data to lemmata
                lemmata.append((headword, headst, ID))
                continue

            #Extracting items
            stringreg, slipnumber, stringori = officialaoitems(line)
            if stringreg:
                slipnumber, slipnote = slipnotecheck(slipnumber)
                items.append((stringori, stringreg, lemmatano, slipnumber, slipnote, notes))
                continue

            #Extracting reference items
            stringreg, slipnumber, stringori = officialaorefitems(line)
            if stringreg:
                slipnumber, slipnote = slipnotecheck(slipnumber)
                refitems.append((stringori, stringreg, lemmatano, slipnumber, slipnote, notes))
                continue

            sys.exit('mismatch in officialandotherinput:' + line)
    return lemmata,lemmatano,items,refitems
        
#Input-related functions for place name index
def pllinebreakdown(line):
    '''Split a data line of the place name index into its five fields.

    Data line structure (tab separated, no field may be empty):
        headword,prop,slipnumberlist,note1,note2
        (prop = property of place name, e.g. province, prefecture etc.)

    Parameters
    ----------
    line : str
        Index data line

    Returns
    -------
    tuple
        (headword,prop,slipnumberlist,note1,note2)

    Raises
    ------
    SystemExit
        If the line does not consist of exactly five non-empty fields.

    '''
    fields = re.search('^([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)$', line)
    if fields is None:
        sys.exit('mismatch in place name index data line\n{}'.format(line))
    return fields.groups()

def plstringformatting(headst,stringori):
    '''Formats the place name strings.

    Place name index preserves original strings in two ways.
    1)The most common version is preserved in "AB" of the headword format
      "AB(CD)".(CD=standardized form)
    2)Irregular versions are put in round brackets behind the slip numbers
    Irregular versions behind slip numbers can have different formats
    2-1)AB,corresponding to 1)
    (This function changes "AB(CD)"  into "A(C)B(D)" which is the common format)
    2-2)C, omitting a character (This function adds the omitted character
    in round brackets)

    The branches, chosen by comparing lengths:

    * Equal length: every character of stringori that differs from the
      character at the same position in headst gets that headst character
      appended in round brackets.
    * Equal length after removing certain characters from stringori
      (see NOTE below) or after removing '（？）': nothing is reformatted.
    * headst longer: characters of headst that do not occur anywhere in
      stringori are put in round brackets in the returned stringreg.
    * headst shorter: stringori is assumed to contain non-unicode
      characters written as '｛...｝'. Each of them is temporarily replaced
      by a one-character placeholder, the equal-length formatting is
      applied, and the placeholders are restored.

    Parameters
    ----------
    headst : str
        Standardized writing of the headword
    stringori : str
        Original writing of the string

    Returns
    -------
    stringreg : str
    stringori : str
        Note the order: (stringreg, stringori).

    Raises
    ------
    SystemExit
        If the lengths still differ after replacing '｛...｝' sequences.

    '''
    if len(headst) == len(stringori):#turns AB（CD） into A（C）B（D）
        stringreg = headst
        stringori = ''.join(
            ori if ori == std else ori + '（' + std + '）'
            for ori, std in zip(stringori, headst))
    # NOTE(review): '[</box>]' is a character class (any of < / b o x >),
    # not the literal text '</box>' - confirm this is what is intended.
    elif (len(headst) == len(re.sub('[</box>]', '', stringori))
          or len(headst) == len(re.sub('（？）', '', stringori))):
        stringreg = headst
    elif len(headst) > len(stringori):
        stringreg = ''.join(
            char if char in stringori else '（' + char + '）'
            for char in headst)
    else:#string includes non-unicode characters
        stringreg = headst
        charlist = re.findall('｛[^｝]+｝', stringori)#storing all non-unicode characters
        # Literal replacement: the '｛...｝' text must not be interpreted as
        # a regular expression (it may contain characters such as '+').
        # The digit placeholders are single characters only for n < 10.
        for n, char in enumerate(charlist):
            stringori = stringori.replace(char, str(n), 1)
        if len(headst) != len(stringori):
            sys.exit('unequal length in stringorikeishiki\n' + headst + '\t' + stringori)
        stringreg, stringori = plstringformatting(headst, stringori)
        for n, char in enumerate(charlist):
            stringori = stringori.replace(str(n), char, 1)
    return (stringreg, stringori)

def plheadwordprocessing(headword,prop):
    '''Bring a place name headword into a shape similar to other indices.

    In the place name index strings = headwords, i.e. a headword of the
    form "AB（CD）" incorporates both the original writing (AB) and the
    standardized writing (CD). Such a headword is taken apart and
    formatted with plstringformatting; a headword without round brackets
    serves as both writings. Finally '〼' in the standardized form is
    replaced by '【……】' and the new headword is built as
    standardized form + '（' + prop + '）'.

    Parameters
    ----------
    headword : str
    prop : str

    Returns
    -------
    headword : str
    headst : str
        standardized form of headword.
    stringori : str

    '''
    bracketed = re.search('([^（]+)（([^）]+)）', headword)
    if bracketed is None:
        headst = stringori = headword
    else:
        stringori, headst = bracketed.groups()
        headst, stringori = plstringformatting(headst, stringori)
    headst = re.sub('〼', '【……】', headst)
    headword = ''.join((headst, '（', prop, '）'))
    return (headword, headst, stringori)


def plstringcreation(headst,stringori,slipnumber):
    '''Create the strings belonging to one slip number of the place name index.

    If an irregular original writing is noted in round brackets behind the
    slip number, it is split off and becomes stringori; unless it contains
    '〼', it is formatted against headst with plstringformatting.
    In all other cases stringreg is simply headst.

    The place name index originally made no distinction between original
    and standardized writing; the distinction was added because it is
    needed for tagging.

    Parameters
    ----------
    headst : str
        standardized form of headword.
    stringori : str
        Original writing used when the slip number carries no note.
    slipnumber : str
        Slip number, possibly followed by a writing in round brackets.

    Returns
    -------
    tuple
        (stringreg,stringori,slipnumber), slipnumber without the note.

    '''
    annotated = re.search('^([^（]+)（(([^）]|（？）)+)）$', slipnumber)
    if annotated is None:
        return (headst, stringori, slipnumber)
    slipnumber = annotated.group(1)
    stringori = annotated.group(2)
    if '〼' in stringori:
        return (headst, stringori, slipnumber)
    stringreg, stringori = plstringformatting(headst, stringori)
    return (stringreg, stringori, slipnumber)

    
def plmultipleappearance(slipnumber):
    '''Check whether a number of appearances is noted behind the slip number.

    The place name index originally does not note the number of
    appearances. As it is needed for tagging the transcriptions, it was
    added manually, guided by malfunctions during tagging. Format:
        [slipnumber count]
    (i.e. a half-width space behind the slip number, followed by the
    count in digits.) The note is omitted if the count is one.

    Parameters
    ----------
    slipnumber : str

    Returns
    -------
    slipnumber : str
        Slip number with the note about the count removed
    kaisu : int
        Number of appearances (1 if no note is present)
    '''
    # Raw string: '\d' inside a normal literal is an invalid escape
    # sequence (SyntaxWarning on Python 3.12+).
    m = re.search(r'^([^ ]+) (\d+)', slipnumber)
    if m is None:
        return (slipnumber, 1)
    return (m.group(1), int(m.group(2)))

def placenameinput(lemmata,lemmatano,items,ID,filename):
    ''' Reads data of place name index

    Data lines whose property starts with one of 郷県道亭郡津国関郵 are not
    processed but written to the log file
    'output(placename overlapping with official names).txt'.

    Parameters
    ----------
    lemmata : list
        [(headword,headst,ID)]
        New entries are appended in place.
    lemmatano : int
        counting the number of lemmata, will be stored in items
    items : list
        [(stringori,stringreg,lemmatano,slipnumber,slipnote,notes)]
        New entries are appended in place.
    ID : str
        sort of index
    filename : str
        Name of file containing place name index data (no default)

    Returns
    -------
    lemmata : list
        list of headwords
    lemmatano: int
        number of accumulated lemmata
    items : list
        list of index items
    plnotes : list
        [(headword,note1,note2)], notes peculiar to the place name index
    '''
    notes = ''#The place name index doesn't have ordinary notes
    plnotes = []#accumulating place name notes together with headword

    # 'with' closes both files on every exit path (the log file used to
    # stay open, and sys.exit inside the loop leaked the data file).
    with open(filename, 'r', encoding='utf-8-sig') as datafile, \
            open('output(placename overlapping with official names).txt', 'w',
                 encoding='utf-8-sig') as logfile:
        for line in datafile:
            line = line.replace('\n', '')
            #Excluding non-data-related lines
            if nondataline(['→', r'^[\.<]', '★'], line):
                continue
            #Extracting headwords and items
            headword, prop, slipnumberlist, note1, note2 = pllinebreakdown(line)
            #Excluding data lines that overlap with official name index. Storing data in logfile
            if re.search('^[郷県道亭郡津国関郵]', prop):
                logfile.write('{}\t{}\t{}\t{}\n{}\n'.format(headword, prop, note1, note2, slipnumberlist))
                continue

            lemmatano += 1
            #Processing headwords
            headword, headst, headstringori = plheadwordprocessing(headword, prop)
            #Adding data to lemmata
            lemmata.append((headword, headst, ID))
            #Adding data peculiar to place name index to plnotes
            plnotes.append((headword, note1, note2))

            #Breaking down slipnumberlist and creating strings for each number
            for slipnumber in slipnumberlist.split(','):
                #Checking whether multiple appearances are noted behind slip number
                slipnumber, multiple = plmultipleappearance(slipnumber)
                #Checking slipnotes
                slipnumber, slipnote = slipnotecheck(slipnumber)
                #Checking whether irregular original writings are noted behind slip number
                #(It is confirmed that there are no cases of coincidence of
                #slipnumber notes and notes about irregular writings)
                #Every slip number starts again from the headword's original writing
                stringreg, stringori, slipnumber = plstringcreation(headst, headstringori, slipnumber)
                #adding items, once per appearance
                item = (stringori, stringreg, lemmatano, slipnumber, slipnote, notes)
                items.extend([item] * multiple)
    return lemmata,lemmatano,items,plnotes

def sliptextinput(filename):
    '''
    reads the original slip text and returns it as a dictionary accessible by slip number

    Parameters
    ----------
    filename : str
        Structure of file (One slip, two or more lines):
            docstyle,tab,posinstyle,tab,posindoc,tab,slipnumber
            sliptext
            sliptext
            ...
        docstyle:document style
        posinstyle: position of document within a group of same docstyle
        posindoc: position of slip within the same document

    Returns
    -------
    sliptext : dict
        {slipnumber:[sliptext,docstyle,posinstyle,posindoc]}
        In case of multiline texts the lines are joined with '#'.
    slipnolist : list
        Remembers slip numbers in order of appearance
        in order to check with original file

    Raises
    ------
    SystemExit
        If a text line appears before the first metadata line.

    '''
    sliptext = {}
    slipnolist = []
    slipno = ''
    # Raw string: '\d' and '\-' inside a normal literal are invalid escape
    # sequences (SyntaxWarning on Python 3.12+).
    header = re.compile(r'^([\d\-]+)\t(\d+)\t(\d+)\t([\d\-+abcdJ⑯⑦⑨⑫⑬⑭]+)$')
    # 'with' makes sure the file gets closed (it used to stay open)
    with open(filename, 'r', encoding='utf-8-sig') as datafile:
        for line in datafile:
            line = line.replace('\n', '')
            m = header.search(line)
            if m:#Hits a line with metadata concerning docstyle etc.
                docstyle, posinstyle, posindoc, slipno = m.groups()
                slipnolist.append(slipno)
                sliptext[slipno] = ['', docstyle, posinstyle, posindoc]
            elif slipno not in sliptext:
                #Formerly a bare KeyError; report the offending line instead
                sys.exit('mismatch in sliptextinput: text before first slip header\n' + line)
            else:#Hits lines with slip text. Lines of one slip are joined with linebreak mark '#'.
                record = sliptext[slipno]
                if record[0]:
                    record[0] += '#'
                record[0] += line
    #For the last slip, delete a linebreak mark '#' at the end of the text.
    #Guarded so that a file without any slip returns empty results.
    if slipnolist:
        sliptext[slipno][0] = re.sub('#$', '', sliptext[slipno][0])
    return (sliptext, slipnolist)

def sliptextcheckoutput(sliptext,slipnolist,filename):
    '''Write the slip texts back to a file for checking against the original.

    For every slip number in slipnolist one metadata line
    (sliptext[slipno][1..3] and the slip number, tab separated) is written,
    followed by the slip text with every '#' turned back into a line break.
    Finally the number of written slips is printed.

    Parameters
    ----------
    sliptext : dict
        {slipnumber:[sliptext,docstyle,posinstyle,posindoc]}
    slipnolist : list
        Slip numbers in the order in which they are to be written
    filename : str
        Name of output file

    Returns
    -------
    None.

    '''
    # 'with' closes the file even if a slip number is missing in sliptext
    with open(filename, 'w', encoding='utf-8-sig') as dataoutput:
        for slipno in slipnolist:
            text, docstyle, posinstyle, posindoc = sliptext[slipno][:4]
            dataoutput.write('{}\t{}\t{}\t{}\n{}\n'.format(
                docstyle, posinstyle, posindoc, slipno, text.replace('#', '\n')))
    print('outputed {} items to {}'.format(len(slipnolist),filename))

def manualtagginginput(filename):
    '''
    Reads manually tagged strings.
    Manually tagging strings that couldn't be reconstructed according
    to original strings in dataprocessing.stringprepremoval
    (log(tagremovalmanualcorrectionneeded.txt)

    Parameters
    ----------
    filename : str
        Name of file with exactly three non-empty, tab separated fields
        per line; the first two fields together form the key, the third
        is the value.

    Returns
    -------
    manualtagging : dict
        {slipno+stringori:tagstring}

    Raises
    ------
    SystemExit
        If a line does not consist of three non-empty fields.

    '''
    manualtagging = {}
    # 'with' guarantees the file is closed even when sys.exit is raised
    with open(filename, 'r', encoding='utf-8-sig') as datafile:
        for linenumber, line in enumerate(datafile, start=1):
            line = line.replace('\n', '')
            m = re.search('^([^\t]+)\t([^\t]+)\t([^\t]+)$', line)
            if m is None:
                sys.exit('mismatch in manualtagginginput line {}\nline is {}'
                         .format(linenumber, line))
            slipno, stringori, tagstring = m.groups()
            manualtagging[slipno + stringori] = tagstring
    return(manualtagging)

#Input-related functions for historical date index
def readdates(filenamedata = 'indexdata(dates).txt',filenamelog = 'log(reading histdates).txt'):
    '''
    Reads data of historical date index

    Identical original strings on the same slip are stored once and
    counted in frq. If such a repetition comes with a different
    standardized string, the inconsistency is written to the log file.
    A '*' in the original string is removed and remembered as slipnote;
    '*' is also removed from the standardized string.

    Parameters
    ----------
    filenamedata : str, optional
        Name of datafile. The default is 'indexdata(dates).txt'.
        Data structure:
            stringori,tab,stringreg,tab,slipnumber,tab,IDchr,tab,IDBCE
            IDchr: nnnnnnnn(nnn=year,nnn=month,nn=day)
            year: 101=秦王政元年...125=秦王政二十五年;126=秦始皇二十六年...137=秦始皇三十七年
                201=秦二世元年
            month: 010=十月,011=十一月,012=十二月,101=正月,102=二月...109=九月,119=後九月
            unknownyears: 999=□□年;193/192=卅□年/廿□年;992/995=□二年/□五年
            unknownmonths: 999=□(□)月;019=十□月
    filenamelog : str, optional
        Name of logfile. The default is 'log(reading histdates).txt'

    Returns
    -------
    histdates : dict
        {slipnumber:[[stringori,frq,stringreg,slipno,IDchr,IDBCE,slipnote]]}
        The records of each slip are sorted in declining order of the
        length of stringori.

    Raises
    ------
    SystemExit
        If a line does not consist of five non-empty, tab separated fields.

    '''
    histdates = {}
    # 'with' closes both files on every exit path (the log file used to
    # stay open, and sys.exit inside the loop leaked the data file).
    with open(filenamedata, 'r', encoding='utf-8-sig') as datafile, \
            open(filenamelog, 'w', encoding='utf-8-sig') as logfile:
        for linenumber, line in enumerate(datafile, start=1):
            # Strip the line break like the other readers of this module do;
            # otherwise the last field (IDBCE) would capture the '\n'.
            line = line.replace('\n', '')
            m = re.search('^([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)$', line)
            if m is None:
                sys.exit('mismatch inreaddates line {}\nline is {}'
                         .format(linenumber, line))
            stringori, stringreg, slipno, IDchr, IDBCE = m.groups()
            slipnote = ''
            if '*' in stringori:
                stringori = stringori.replace('*', '')
                slipnote = '*'
            stringreg = stringreg.replace('*', '')

            records = histdates.setdefault(slipno, [])
            for record in records:
                if record[0] == stringori:
                    #Same original string again on this slip: count it
                    record[1] += 1
                    if stringreg != record[2]:
                        logfile.write('inconsistency in histdates\n{}\n{}'
                            '\n\n'.format(record, [stringori, 1, stringreg, slipno, IDchr, IDBCE]))
                    break
            else:
                #No record with this original string yet
                records.append([stringori, 1, stringreg, slipno, IDchr, IDBCE, slipnote])

    #Sorting histdates in declining order of length of stringori
    for slipno in histdates:
        histdates[slipno] = sorted(histdates[slipno], key=lambda x: -len(x[0]))
    return(histdates)

    
# Placeholder entry point: when the module is run as a script, only a dummy
# value is bound; none of the functions defined above is called here.
if __name__ == "__main__":
    zero = 0