VoiceUtil.java
3.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
package com.bsth.data.zndd.voice;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import com.sun.media.sound.WaveFileReader;
import com.sun.media.sound.WaveFileWriter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.util.Assert;
import org.springframework.util.StringUtils;
import org.vosk.LibVosk;
import org.vosk.LogLevel;
import org.vosk.Model;
import org.vosk.Recognizer;
import javax.sound.sampled.*;
import java.io.*;
import java.nio.file.Files;
import java.nio.file.Paths;
public class VoiceUtil {
/* @Value("${leenleda.vosk.model}")
private String VOSKMODELPATH;*/
Logger log = LoggerFactory.getLogger(this.getClass());
public String getWord(String filePath) throws IOException, UnsupportedAudioFileException {
Assert.isTrue(StringUtils.hasLength("D:\\pcm\\vosk-model-small-cn-0.22"), "无效的VOS模块!");
byte[] bytes = Files.readAllBytes(Paths.get(filePath));
// 转换为16KHZ
reSamplingAndSave(bytes, filePath);
File f = new File(filePath);
RandomAccessFile rdf = null;
rdf = new RandomAccessFile(f, "r");
log.info("声音尺寸:{}", toInt(read(rdf, 4, 4)));
log.info("音频格式:{}", toShort(read(rdf, 20, 2)));
short track = toShort(read(rdf, 22, 2));
log.info("1 单声道 2 双声道: {}", track);
log.info("采样率、音频采样级别 16000 = 16KHz: {}", toInt(read(rdf, 24, 4)));
log.info("每秒波形的数据量:{}", toShort(read(rdf, 22, 2)));
log.info("采样帧的大小:{}", toShort(read(rdf, 32, 2)));
log.info("采样位数:{}", toShort(read(rdf, 34, 2)));
rdf.close();
LibVosk.setLogLevel(LogLevel.WARNINGS);
try (
Model model = new Model("D:\\pcm\\vosk-model-small-cn-0.22");
InputStream ais = AudioSystem.getAudioInputStream(new BufferedInputStream(new FileInputStream(filePath)));
// 采样率为音频采样率的声道倍数
Recognizer recognizer = new Recognizer(model, 16000 * track)) {
recognizer.setWords(true);
String result = recognizer.getFinalResult();
log.info("识别结果:{}", result);
if (StringUtils.hasLength(result)) {
JSONObject jsonObject = JSON.parseObject(result);
return jsonObject.getString("text").replace(" ", "");
}
return "";
}
}
public static int toInt(byte[] b) {
return (((b[3] & 0xff) << 24) + ((b[2] & 0xff) << 16) + ((b[1] & 0xff) << 8) + ((b[0] & 0xff) << 0));
}
public static short toShort(byte[] b) {
return (short) ((b[1] << 8) + (b[0] << 0));
}
public static byte[] read(RandomAccessFile rdf, int pos, int length) throws IOException {
rdf.seek(pos);
byte result[] = new byte[length];
for (int i = 0; i < length; i++) {
result[i] = rdf.readByte();
}
return result;
}
public static void reSamplingAndSave(byte[] data, String path) throws IOException, UnsupportedAudioFileException {
WaveFileReader reader = new WaveFileReader();
AudioInputStream audioIn = reader.getAudioInputStream(new ByteArrayInputStream(data));
AudioFormat srcFormat = audioIn.getFormat();
int targetSampleRate = 16000;
AudioFormat dstFormat = new AudioFormat(srcFormat.getEncoding(),
targetSampleRate,
srcFormat.getSampleSizeInBits(),
srcFormat.getChannels(),
srcFormat.getFrameSize(),
srcFormat.getFrameRate(),
srcFormat.isBigEndian());
AudioInputStream convertedIn = AudioSystem.getAudioInputStream(dstFormat, audioIn);
File file = new File(path);
WaveFileWriter writer = new WaveFileWriter();
writer.write(convertedIn, AudioFileFormat.Type.WAVE, file);
}
}