hutaoVits/jsonDecoder.py

172 lines
6.8 KiB
Python
Raw Permalink Normal View History

import json
import os
import pathlib
import random
import sys
import argparse
from alive_progress import alive_bar
import ijson
import pydub
# Command-line interface.
# The sample values './test.json', 'CHS' and '甘雨' used to be assigned to
# ``args`` after parsing, which silently discarded whatever the user passed.
# They are now argparse defaults: running with no options behaves as before,
# while explicitly supplied options are honoured.
parser = argparse.ArgumentParser(description='json file path')
parser.add_argument('--path', dest='filePath', type=str, default='./test.json',
                    help='target json file path')
parser.add_argument('--language', dest='language', type=str, default='CHS',
                    help='target language')
parser.add_argument('--character', dest='character', type=str, default='甘雨',
                    help='target character')
args = parser.parse_args()

# Module-level settings read by the functions below.
filePath = str(args.filePath)
targetLanguage = str(args.language)
targetCharacter = str(args.character)
# todo : function to check if argument is valid or not
# prefixlist generator for searching for target item
def prefixListGenerator(filePath,targetLanguage,targetCharacter):
    """Collect the keys of the entries matching a language and a character.

    The JSON file is streamed with ``ijson.parse``. An entry is identified by
    the first dot-separated component of the ijson prefix. An entry is
    selected when it contains a ``language`` field equal to ``targetLanguage``
    and an ``npcName`` field equal to ``targetCharacter``, in either order.

    NOTE(review): this assumes the top level of the file is an object whose
    keys contain no '.' -- confirm against the real data.

    Args:
        filePath: path of the source JSON file (opened as UTF-8).
        targetLanguage: value the ``language`` field must equal.
        targetCharacter: value the ``npcName`` field must equal.

    Returns:
        List of matching entry keys, in the order they occur in the file.
    """
    prefixlist = []
    currentEntry = None
    languageMatched = False
    characterMatched = False
    recorded = False
    with open(filePath, 'r', encoding='utf-8') as f:
        for prefix, event, value in ijson.parse(f):
            entryKey = prefix.split('.')[0]
            if entryKey != currentEntry:
                # A new entry starts: matches seen in the previous entry must
                # not leak into this one.
                currentEntry = entryKey
                languageMatched = False
                characterMatched = False
                recorded = False
            if prefix.endswith('.language') and value == targetLanguage:
                languageMatched = True
            elif prefix.endswith('.npcName') and value == targetCharacter:
                characterMatched = True
            else:
                continue
            # Record the entry once, as soon as both conditions hold,
            # regardless of which field appeared first.
            if languageMatched and characterMatched and not recorded:
                prefixlist.append(entryKey)
                recorded = True
    print("------ prefixlist grenerated length: "+str(len(prefixlist))+" ------")
    return prefixlist
def resultJsonGenerator(prefixlist,filePath):
    """Write ``<cwd>/<targetCharacter>/result.json`` for the selected entries.

    For every key in ``prefixlist`` the entry is read from the source JSON,
    its ``fileName`` suffix is replaced by ``.wav``, and the entry is kept only
    if that wav path exists (relative paths resolve against the current
    working directory). Entries whose wav file is missing are reported and
    dropped.

    The source file is streamed once with ``ijson.kvitems`` instead of being
    re-parsed for every key. ``prefixlist`` is filtered after the scan rather
    than mutated while being iterated, which used to skip the entry following
    each removal.

    Args:
        prefixlist: top-level keys to extract; filtered in place.
        filePath: path of the source JSON file (opened as UTF-8).

    Returns:
        ``prefixlist`` itself, reduced to the keys written to result.json.
    """
    dstFilePathCharacter = pathlib.Path(pathlib.Path.cwd(), targetCharacter, 'result.json')
    dstFilePathCharacter.parent.mkdir(parents=True, exist_ok=True)

    wanted = set(prefixlist)
    result = {}
    print('------ result.json generating ------')
    with alive_bar(len(prefixlist)) as bar:
        with open(filePath, 'r', encoding='utf-8') as src:
            # kvitems with an empty prefix yields the (key, value) pairs of
            # the top-level object.
            for prefixitem, dataItems in ijson.kvitems(src, ''):
                if prefixitem not in wanted:
                    continue
                bar()
                # update file suffix from .wem to .wav
                wavFileName = pathlib.Path(dataItems['fileName']).with_suffix('.wav')
                dataItems['fileName'] = str(wavFileName)
                if wavFileName.exists():
                    result[prefixitem] = dataItems
                else:
                    print('removed prefixitem: ' + str(prefixitem)+' reason: file no found')

    # Keep the caller's list object but drop every key that was not written.
    prefixlist[:] = [prefixitem for prefixitem in prefixlist if prefixitem in result]

    # Serialise before opening so a serialisation error cannot truncate an
    # existing result.json.
    resultOut = json.dumps(result, ensure_ascii=False)
    with open(dstFilePathCharacter, 'w', encoding='utf-8') as out:
        out.write(resultOut)
    return prefixlist
def wavFileClassify(srcPath,dstPath):
    """Re-export one audio file as a single-channel, 22050 Hz wav.

    Args:
        srcPath: audio file loaded with ``pydub.AudioSegment.from_file``.
        dstPath: destination handed to ``AudioSegment.export`` (format 'wav').
    """
    converted = (
        pydub.AudioSegment.from_file(srcPath)
        .set_channels(1)
        .set_frame_rate(22050)
    )
    converted.export(dstPath, format='wav')
def datasetGenerator(prefixlist):
    """Append training lines to ``list.txt`` and ``list_val.txt``.

    For every key in ``prefixlist`` that has an entry with a ``text`` field in
    ``<cwd>/<targetCharacter>/result.json``, the line
    ``wavs/<fileName>|<text>`` is appended to ``list.txt``. Each such line is
    also appended to ``list_val.txt`` when ``random.randint(1, 10) == 1``.

    result.json is loaded once instead of being re-parsed for every key. An
    empty ``prefixlist`` is a no-op instead of failing on an unbound file
    handle.

    Args:
        prefixlist: keys of the result.json entries to export.
    """
    characterDir = pathlib.Path(pathlib.Path.cwd(), targetCharacter)
    dstFilePathdataset = characterDir / 'list.txt'
    dstFilePathValdataset = characterDir / 'list_val.txt'
    dstFilePathCharacter = characterDir / 'result.json'

    result = {}
    if prefixlist:
        with open(dstFilePathCharacter, 'r', encoding='utf-8') as resultFile:
            result = json.load(resultFile)

    # Both list files are opened in append mode, as before.
    with open(dstFilePathdataset, 'a', encoding='utf-8') as dataset, \
            open(dstFilePathValdataset, 'a', encoding='utf-8') as valDataset:
        print('------ dataset generating ------')
        with alive_bar(len(prefixlist)) as bar:
            for prefixitem in prefixlist:
                bar()
                dataItems = result.get(prefixitem)
                if dataItems is None or 'text' not in dataItems:
                    continue
                # generate training dataset list.txt
                wavPath = pathlib.Path('wavs', str(dataItems['fileName']))
                datastring = wavPath.as_posix() + '|' + str(dataItems['text']) + '\n'
                dataset.write(datastring)
                # generate training valdataset list_val.txt
                if random.randint(1, 10) == 1:
                    valDataset.write(datastring)
# file select by filename in result.json
def fileSelector():
    """Convert every wav file referenced by result.json into the dataset folder.

    Reads ``<cwd>/<targetCharacter>/result.json``, collects every value whose
    ijson prefix ends with ``.fileName``, and converts each existing source
    file ``<cwd>/<fileName>`` to ``<cwd>/<targetCharacter>/wavs/<fileName>``
    with ``wavFileClassify``. Missing source files are skipped silently.

    Raises:
        SystemExit: with status 1 when result.json does not exist. The status
            used to be 0, which reported success for a failed run.
    """
    workspacePath = pathlib.Path.cwd()
    srcFilePathCharacter = pathlib.Path(workspacePath, targetCharacter, 'result.json')
    if not srcFilePathCharacter.exists():
        print('------ result.json do not exists! check for result.json generated or not ------')
        sys.exit(1)

    with open(srcFilePathCharacter, 'r', encoding='utf-8') as resultFile:
        filePathList = [
            value
            for prefix, event, value in ijson.parse(resultFile)
            if prefix.endswith('.fileName')
        ]

    print('------ classfying wav file with frame rate 22050 channels 1 ------')
    with alive_bar(len(filePathList)) as bar:
        for srcFilePathTemp in filePathList:
            bar()
            srcFilePath = pathlib.Path(workspacePath, srcFilePathTemp)
            if not srcFilePath.exists():
                continue
            dstFilePath = pathlib.Path(workspacePath, targetCharacter, 'wavs', srcFilePathTemp)
            # Create the destination folder on demand; a no-op when it exists.
            dstFilePath.parent.mkdir(parents=True, exist_ok=True)
            wavFileClassify(srcFilePath, dstFilePath)
if __name__ == "__main__":
    # Pipeline: select matching entries, write result.json for those whose wav
    # file exists, emit the training lists, then convert the wav files.
    prefixlist = resultJsonGenerator(
        prefixListGenerator(filePath, targetLanguage, targetCharacter),
        filePath,
    )
    datasetGenerator(prefixlist)
    fileSelector()
    print("------ done! check result.json for what you need ------")