172 lines
6.8 KiB
Python
172 lines
6.8 KiB
Python
|
import json
|
||
|
import os
|
||
|
import pathlib
|
||
|
import random
|
||
|
import sys
|
||
|
import argparse
|
||
|
|
||
|
from alive_progress import alive_bar
|
||
|
import ijson
|
||
|
import pydub
|
||
|
|
||
|
# Command-line arguments.
# The defaults are the values that were previously hard-coded as "test
# arguments" and assigned over the parsed result, which silently discarded
# anything passed on the command line. Using them as defaults keeps the
# no-flag behaviour identical while letting --path/--language/--character
# actually take effect.
parser = argparse.ArgumentParser(description='json file path')
parser.add_argument('--path', dest='filePath', type=str,
                    default='./test.json', help='target json file path')
parser.add_argument('--language', dest='language', type=str,
                    default='CHS', help='target language')
parser.add_argument('--character', dest='character', type=str,
                    default='甘雨', help='target character')
args = parser.parse_args()

# Module-level settings read by the functions below.
filePath = str(args.filePath)
targetLanguage = str(args.language)
targetCharacter = str(args.character)
|
||
|
|
||
|
# TODO: add a function that checks whether the command-line arguments are valid
|
||
|
|
||
|
|
||
|
|
||
|
# Prefix list generator: finds the top-level keys of the target items.
def prefixListGenerator(filePath, targetLanguage, targetCharacter):
    """Collect the top-level keys whose language and npcName both match.

    The JSON file at ``filePath`` is stream-parsed with ``ijson.parse``.
    Whenever a ``*.language`` value equals ``targetLanguage`` the first
    component of its prefix is remembered; whenever a ``*.npcName`` value
    equals ``targetCharacter`` and the first component of its prefix is the
    remembered one, that component is appended to the result.  A match is
    therefore only recorded when the language field was seen before the
    npcName field.

    Args:
        filePath: Path of the JSON file to scan (opened as UTF-8).
        targetLanguage: Value the ``language`` field must equal.
        targetCharacter: Value the ``npcName`` field must equal.

    Returns:
        List of matching first-level prefix components, in file order.
    """
    prefixlist = []
    # Initialised up front: the npcName branch reads this name, and without
    # an initial value it raised UnboundLocalError whenever an npcName match
    # came before any language match.
    languageKey = None

    with open(filePath, 'r', encoding='utf-8') as f:
        for prefix, event, value in ijson.parse(f):
            if prefix.endswith('.language') and value == targetLanguage:
                languageKey = str(prefix).split('.')[0]

            if prefix.endswith('.npcName') and value == targetCharacter:
                characterKey = str(prefix).split('.')[0]
                if languageKey == characterKey:
                    prefixlist.append(characterKey)

    # The with-block has already closed the file; no explicit close needed.
    print("------ prefixlist grenerated length: " + str(len(prefixlist)) + " ------")
    return prefixlist
|
||
|
|
||
|
|
||
|
|
||
|
def resultJsonGenerator(prefixlist, filePath):
    """Write ``<cwd>/<targetCharacter>/result.json`` for the given prefixes.

    For every prefix the items found under it in ``filePath`` (via
    ``ijson.items``) get their ``fileName`` suffix replaced by ``.wav``.
    Items whose resulting path exists are stored in the result under the
    prefix; for the others a "removed prefixitem" message is printed and the
    prefix is dropped from the list.  Reads the module-level
    ``targetCharacter`` for the output directory name.

    Args:
        prefixlist: Prefixes to look up.  The list is updated in place so
            that it only holds the prefixes stored in the result.
        filePath: Path of the source JSON file (opened as UTF-8).

    Returns:
        The same ``prefixlist`` object, filtered as described above.
    """
    workspacePath = pathlib.Path.cwd()
    dstFilePathCharacter = pathlib.Path(workspacePath, targetCharacter, 'result.json')
    # Create the output directory once instead of re-checking per prefix.
    dstFilePathCharacter.parent.mkdir(parents=True, exist_ok=True)

    # Bound before the loop so an empty or fully rejected prefix list still
    # produces a valid (empty) result instead of a NameError.
    result = {}
    keptPrefixes = []

    print('------ result.json generating ------')
    with alive_bar(len(prefixlist)) as bar:
        # The list is not modified while it is being iterated: removing
        # elements inside the loop made the iterator skip the next prefix.
        for prefixitem in prefixlist:
            bar()
            stored = False
            with open(filePath, 'r', encoding='utf-8') as source:
                for dataItems in ijson.items(source, prefixitem):
                    # update file suffix from .wem to .wav
                    wavPath = pathlib.Path(dataItems['fileName']).with_suffix('.wav')
                    dataItems['fileName'] = str(wavPath)
                    if wavPath.exists():
                        result[prefixitem] = dataItems
                        stored = True
                    else:
                        print('removed prefixitem: ' + str(prefixitem) + ' reason: file no found')
            if stored:
                keptPrefixes.append(prefixitem)

    # Written once, after all prefixes have been processed.
    with open(dstFilePathCharacter, 'w', encoding='utf-8') as out:
        out.write(json.dumps(result, ensure_ascii=False))

    # Keep the original contract of mutating and returning the caller's list.
    prefixlist[:] = keptPrefixes
    return prefixlist
|
||
|
|
||
|
|
||
|
def wavFileClassify(srcPath, dstPath):
    """Convert the audio at ``srcPath`` and export it to ``dstPath`` as WAV.

    The audio is loaded with pydub, set to 1 channel and a frame rate of
    22050, and exported with ``format='wav'``.
    """
    converted = (
        pydub.AudioSegment.from_file(srcPath)
        .set_channels(1)
        .set_frame_rate(22050)
    )
    converted.export(dstPath, format='wav')
|
||
|
|
||
|
|
||
|
def datasetGenerator(prefixlist):
    """Append training lines to ``list.txt`` and ``list_val.txt``.

    Entries are read from ``<cwd>/<targetCharacter>/result.json``.  For every
    prefix whose entry has a ``text`` key, the line
    ``wavs/<fileName>|<text>`` is appended to ``list.txt``; the same line is
    also appended to ``list_val.txt`` when ``random.randint(1, 10)`` returns
    1.  Both files are opened in append mode, so existing content is kept.
    Reads the module-level ``targetCharacter`` for the directory name.

    Args:
        prefixlist: Keys of the result.json entries to export.
    """
    workspacePath = pathlib.Path.cwd()
    dstFilePathdataset = pathlib.Path(workspacePath, targetCharacter, 'list.txt')
    dstFilePathValdataset = pathlib.Path(workspacePath, targetCharacter, 'list_val.txt')
    dstFilePathCharacter = pathlib.Path(workspacePath, targetCharacter, 'result.json')

    # Load result.json once instead of re-opening and re-parsing it for
    # every prefix (which was quadratic in the number of entries).
    with open(str(dstFilePathCharacter), 'r', encoding='utf-8') as resultFile:
        entries = json.load(resultFile)

    with open(dstFilePathdataset, 'a', encoding='utf-8') as dataset, \
            open(dstFilePathValdataset, 'a', encoding='utf-8') as valDataset:
        print('------ dataset generating ------')
        with alive_bar(len(prefixlist)) as bar:
            for prefixitem in prefixlist:
                bar()
                dataItems = entries.get(prefixitem)
                # Entries without text produce no line at all, so the
                # validation file can never receive a stale line.
                if dataItems is None or 'text' not in dataItems:
                    continue

                # generate training dataset list.txt
                datastringPathTemp = pathlib.Path('wavs', str(dataItems['fileName']))
                datastring = datastringPathTemp.as_posix() + '|' + str(dataItems['text']) + '\n'
                dataset.write(datastring)

                # generate training valdataset list_val.txt
                if random.randint(1, 10) == 1:
                    valDataset.write(datastring)
    # The with-blocks flush and close every file; the former explicit close()
    # calls were redundant and the one on resultFile raised NameError when
    # prefixlist was empty.
|
||
|
|
||
|
# Select files by the fileName values stored in result.json.
def fileSelector():
    """Convert every file named in result.json into the ``wavs`` directory.

    Reads ``<cwd>/<targetCharacter>/result.json``, gathers every value whose
    ijson prefix ends with ``.fileName`` and, for each one that exists under
    the current working directory, calls ``wavFileClassify`` to write it to
    ``<cwd>/<targetCharacter>/wavs/<fileName>``, creating the destination
    directory when it is missing.  Source files that do not exist are
    skipped.  If result.json itself is missing, a message is printed and the
    process exits with status 0.
    """
    workspacePath = pathlib.Path.cwd()
    srcFilePathCharacter = pathlib.Path(workspacePath, targetCharacter, 'result.json')

    if not os.path.exists(srcFilePathCharacter):
        print('------ result.json do not exists! check for result.json generated or not ------')
        sys.exit(0)

    with open(str(srcFilePathCharacter), 'r', encoding='utf-8') as resultFile:
        filePathList = [
            value
            for prefix, _event, value in ijson.parse(resultFile)
            if prefix.endswith('.fileName')
        ]

    print('------ classfying wav file with frame rate 22050 channels 1 ------')
    with alive_bar(len(filePathList),) as bar:
        for recordedName in filePathList:
            bar()
            srcFilePath = pathlib.Path(workspacePath, recordedName)
            dstFilePath = pathlib.Path(workspacePath, targetCharacter, 'wavs', recordedName)

            if not srcFilePath.exists():
                continue
            if not dstFilePath.parent.exists():
                os.makedirs(dstFilePath.parent)
            wavFileClassify(srcFilePath, dstFilePath)
|
||
|
|
||
|
|
||
|
|
||
|
if __name__ == "__main__":
    # Pipeline: find matching prefixes, write result.json, write the
    # dataset list files, then convert the referenced audio files.
    matchedPrefixes = prefixListGenerator(filePath, targetLanguage, targetCharacter)
    keptPrefixes = resultJsonGenerator(matchedPrefixes, filePath)
    datasetGenerator(keptPrefixes)
    fileSelector()
    print("------ done! check result.json for what you need ------")
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|