Using Alibaba cloud TTS to realize web voice broadcast

Keywords: socket Session JSON Java

I. opening Alibaba cloud TTS service

Log in to Alibaba Cloud and choose Products -> Artificial Intelligence -> Speech Synthesis

Click "apply for opening" and create a project in "management console"

Copy token and appkey

II. Interface with speech synthesis api

View interface documentation

Because the SDK requires many third-party jar packages, it is recommended to integrate with the RESTful API instead.

Copy the demo code from the interface document, paste in your token and appkey, and it can run directly. The demo generates a syAudio.wav file, which can be played directly in any audio player.

According to the document prompts, two jar packages need to be introduced into the project:

<dependency>
    <groupId>com.squareup.okhttp3</groupId>
    <artifactId>okhttp</artifactId>
    <version>3.9.1</version>
</dependency>
<!-- http://mvnrepository.com/artifact/com.alibaba/fastjson -->
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.42</version>
</dependency>

Voice generation tool class:

import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;

import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.alibaba.fastjson.JSONObject;
import com.hsoft.commutil.props.PropertiesUtil;

import okhttp3.MediaType;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.RequestBody;
import okhttp3.Response;
/**
 * Calls the Alibaba Cloud speech-synthesis (TTS) RESTful API and returns the
 * synthesized audio as raw bytes.
 *
 * Thread-safe singleton; credentials are loaded lazily from properties
 * "aliyun.voice.appkey" and "aliyun.voice.token".
 */
public class SpeechRestfulUtil {
	private static final Logger logger = LoggerFactory.getLogger(SpeechRestfulUtil.class);

	/**
	 * One shared client for all requests. Each OkHttpClient owns its own
	 * connection pool and dispatcher threads, so creating a new client per
	 * request (as the original did) leaks resources under load.
	 */
	private static final OkHttpClient httpClient = new OkHttpClient();

	private final String accessToken;
	private final String appkey;

	/**
	 * volatile is required for double-checked locking to be safe: without it
	 * another thread may observe a partially constructed instance.
	 */
	private static volatile SpeechRestfulUtil instance;

	private static SpeechRestfulUtil getInstance() {
		SpeechRestfulUtil local = instance;
		if (local == null) {
			synchronized (SpeechRestfulUtil.class) {
				local = instance;
				if (local == null) {
					String appkey = PropertiesUtil.getProperty("aliyun.voice.appkey");
					String token = PropertiesUtil.getProperty("aliyun.voice.token");
					local = new SpeechRestfulUtil(appkey, token);
					instance = local;
				}
			}
		}
		return local;
	}

	private SpeechRestfulUtil(String appkey, String token) {
		this.appkey = appkey;
		this.accessToken = token;
	}

	/**
	 * Issues an HTTPS GET request to the TTS endpoint.
	 *
	 * Endpoint: nls-gateway.cn-shanghai.aliyuncs.com, path /stream/v1/tts.
	 * Required query parameters: appkey, token, text, format, sample_rate.
	 * Optional: voice, volume, speech_rate, pitch_rate.
	 *
	 * @param text       already URL-encoded text to synthesize
	 * @param format     requested audio container, e.g. "wav"
	 * @param sampleRate sample rate in Hz, e.g. 16000
	 * @return the raw audio bytes, or null on any failure
	 */
	private byte[] processGETRequet(String text, String format, int sampleRate) {
		StringBuilder url = new StringBuilder("https://nls-gateway.cn-shanghai.aliyuncs.com/stream/v1/tts")
				.append("?appkey=").append(appkey)
				.append("&token=").append(accessToken)
				.append("&text=").append(text)
				.append("&format=").append(format)
				.append("&sample_rate=").append(sampleRate)
				// speech_rate: range -500..500, default 0; 100 = slightly faster
				.append("&speech_rate=").append(100);
		// Other optional parameters, left at the service defaults:
		// &voice=xiaoyun   &volume=50 (0-100)   &pitch_rate=0 (-500..500)

		Request request = new Request.Builder()
				.url(url.toString())
				.get()
				.build();
		// try-with-resources guarantees the response body is closed on every
		// path; the original leaked it when an exception was thrown mid-read.
		try (Response response = httpClient.newCall(request).execute()) {
			String contentType = response.header("Content-Type");
			// On success the gateway answers with an audio body (observed as
			// "audio/mpeg" in the official demo); on error it answers with
			// application/json describing the failure. Accept any audio/*
			// so a future "audio/wav" answer is not treated as an error.
			if (contentType != null && contentType.startsWith("audio/")) {
				byte[] bytes = response.body().bytes();
				logger.info("The GET SpeechRestful succeed!");
				return bytes;
			}
			// ContentType is null or "application/json"
			logger.warn("The GET SpeechRestful failed: {}", response.body().string());
		} catch (Exception e) {
			logger.error("processGETRequet", e);
		}
		return null;
	}

	/**
	 * Issues an HTTPS POST request to the same TTS endpoint, carrying the
	 * parameters as a JSON body instead of query parameters.
	 *
	 * @param text          plain text to synthesize (no URL-encoding needed)
	 * @param audioSaveFile unused; kept for signature compatibility with the
	 *                      original demo, which wrote the audio to this file
	 * @param format        requested audio container, e.g. "wav"
	 * @param sampleRate    sample rate in Hz, e.g. 16000
	 * @return the raw audio bytes, or null on any failure
	 */
	private byte[] processPOSTRequest(String text, String audioSaveFile, String format, int sampleRate) {
		String url = "https://nls-gateway.cn-shanghai.aliyuncs.com/stream/v1/tts";
		JSONObject taskObject = new JSONObject();
		taskObject.put("appkey", appkey);
		taskObject.put("token", accessToken);
		taskObject.put("text", text);
		taskObject.put("format", format);
		taskObject.put("sample_rate", sampleRate);
		// Optional parameters, left at the service defaults:
		// taskObject.put("voice", "xiaoyun");     // pronunciation voice
		// taskObject.put("volume", 50);           // 0-100, default 50
		// taskObject.put("speech_rate", 0);       // -500..500, default 0
		// taskObject.put("pitch_rate", 0);        // -500..500, default 0
		String bodyContent = taskObject.toJSONString();

		RequestBody reqBody = RequestBody.create(MediaType.parse("application/json"), bodyContent);
		Request request = new Request.Builder()
				.url(url)
				.header("Content-Type", "application/json")
				.post(reqBody)
				.build();

		try (Response response = httpClient.newCall(request).execute()) {
			String contentType = response.header("Content-Type");
			if (contentType != null && contentType.startsWith("audio/")) {
				byte[] bytes = response.body().bytes();
				logger.info("The POST SpeechRestful succeed!");
				return bytes;
			}
			// ContentType is null or "application/json"
			logger.warn("The POST SpeechRestful failed: {}", response.body().string());
		} catch (Exception e) {
			logger.error("processPOSTRequest", e);
		}
		return null;
	}

	/**
	 * Synthesizes speech for the given text.
	 *
	 * @param text plain text to speak; blank or null input yields null
	 * @return wav audio bytes sampled at 16 kHz, or null on failure
	 */
	public static byte[] text2voice(String text) {
		if (StringUtils.isBlank(text)) {
			return null;
		}
		SpeechRestfulUtil util = SpeechRestfulUtil.getInstance();
		// URL-encode per RFC 3986. URLEncoder implements the older
		// application/x-www-form-urlencoded rules, so patch the differences:
		// space, '*' and '~' are encoded differently between the two specs.
		String textUrlEncode = text;
		try {
			textUrlEncode = URLEncoder.encode(text, "UTF-8")
					.replace("+", "%20")
					.replace("*", "%2A")
					.replace("%7E", "~");
		} catch (UnsupportedEncodingException e) {
			logger.error("encode", e);
		}
		String format = "wav";
		int sampleRate = 16000;
		return util.processGETRequet(textUrlEncode, format, sampleRate);
	}

}

III. Integrating websocket

Of course, our goal is not to get an audio file, but to hear the sound directly on the web site.

To achieve this, we need to introduce Websocket, push the audio data directly to the web page, and then play it with a FileReader object.

1. Introducing jar package

<dependency>
	<groupId>org.springframework.boot</groupId>
	<artifactId>spring-boot-starter-websocket</artifactId>
	<exclusions>
		<exclusion>
			<groupId>org.slf4j</groupId>
			<artifactId>log4j-over-slf4j</artifactId>
		</exclusion>
		<exclusion>
			<groupId>org.hibernate</groupId>
			<artifactId>hibernate-validator</artifactId>
		</exclusion>
	</exclusions>
</dependency>

2. Create a Websocket processing class

/**
 * WebSocket endpoint for voice broadcasting.
 *
 * Registers each opened session with {@link VoicePool} and echoes any text
 * message back to its sender (used by the front end as a heartbeat reply).
 */
public class VoiceHandler extends AbstractWebSocketHandler {
	private static final Logger logger = LoggerFactory.getLogger(VoiceHandler.class);

	/** A client connected: track its session in the pool. */
	@Override
	public void afterConnectionEstablished(WebSocketSession session) throws Exception {
		VoicePool.add(session);
	}

	/** A client disconnected: drop its session from the pool. */
	@Override
	public void afterConnectionClosed(WebSocketSession session, CloseStatus status) throws Exception {
		VoicePool.remove(session);
	}

	/** Echo every text frame back to the sender (heartbeat round-trip). */
	@Override
	protected void handleTextMessage(WebSocketSession session, TextMessage message) throws Exception {
		String payload = message.getPayload();
		logger.debug("receive Msg :" + payload);
		session.sendMessage(new TextMessage(payload));
	}

}

3. Create websocket connection pool management class

/**
 * Registry of active voice-broadcast WebSocket sessions, grouped by company.
 *
 * pool maps sessionId -> session; userMap maps companyId -> the session ids
 * belonging to that company. Broadcasts run on a fixed worker pool so the
 * caller is never blocked by the TTS HTTP call or slow clients.
 */
public class VoicePool {
	private static final Logger logger = LoggerFactory.getLogger(VoicePool.class);
	private static final Map<String, WebSocketSession> pool = new ConcurrentHashMap<String, WebSocketSession>();
	private static final Map<Long, List<String>> userMap = new ConcurrentHashMap<Long, List<String>>();
	private static final ExecutorService threadPool = Executors.newFixedThreadPool(50);

	/**
	 * Extracts the companyId query parameter from the session URI.
	 *
	 * @return the parsed id, or null when the parameter is absent or not a
	 *         number. The original called Long.valueOf on a possibly-null
	 *         value and threw NPE before its own null check could run.
	 */
	private static Long parseCompanyId(WebSocketSession session) {
		try {
			Map<String, String> map = ParamUtil.parser(session.getUri().getQuery());
			String raw = (map == null) ? null : map.get("companyId");
			return (raw == null) ? null : Long.valueOf(raw);
		} catch (Exception e) {
			logger.warn("invalid companyId in query: {}", session.getUri(), e);
			return null;
		}
	}

	/** Registers a new session and indexes it under its company. */
	public static void add(WebSocketSession inbound) {
		pool.put(inbound.getId(), inbound);
		Long companyId = parseCompanyId(inbound);
		logger.info("add companyId:{}", companyId);
		if (companyId != null) {
			// CopyOnWriteArrayList: broadcast() iterates these lists while
			// add/remove mutate them concurrently; a plain ArrayList would
			// risk ConcurrentModificationException or lost updates.
			userMap.computeIfAbsent(companyId, k -> new CopyOnWriteArrayList<String>())
					.add(inbound.getId());
		}
		logger.info("add connection {},total size {}", inbound.getId(), pool.size());
	}

	/** Unregisters a closed session and prunes its company entry if empty. */
	public static void remove(WebSocketSession socket) {
		String sessionId = socket.getId();
		Long companyId = parseCompanyId(socket);
		logger.info("remove companyId:{}", companyId);
		if (StringUtils.isNotBlank(sessionId) && companyId != null) {
			List<String> lstInBound = userMap.get(companyId);
			if (lstInBound != null) {
				lstInBound.remove(sessionId);
				if (lstInBound.isEmpty()) {
					userMap.remove(companyId);
				}
			}
		}
		pool.remove(sessionId);
		logger.info("remove connection {},total size {}", sessionId, pool.size());
	}

	/**
	 * Synthesizes the message text and pushes the audio to every session of
	 * the target company. Both the blocking TTS HTTP call and the sends run
	 * on the worker pool, so this method returns immediately.
	 */
	public static void broadcast(VoiceMsgVo vo) {
		Long companyId = vo.getCompanyId();
		if (companyId == null || companyId == 0L) {
			return;
		}
		List<String> lstInBoundId = userMap.get(companyId);
		if (lstInBoundId == null || lstInBoundId.isEmpty()) {
			return;
		}
		threadPool.execute(() -> {
			try {
				// TTS is a remote HTTP request; keep it off the caller's thread.
				byte[] bytes = SpeechRestfulUtil.text2voice(vo.getText());
				if (bytes == null) {
					return;
				}
				logger.info("send to companyId:{}", companyId);
				for (String id : lstInBoundId) {
					// Send to specified user
					WebSocketSession connection = pool.get(id);
					if (connection != null) {
						connection.sendMessage(new BinaryMessage(bytes));
					}
				}
			} catch (Exception e) {
				logger.error("broadcast error: companyId:{}", companyId, e);
			}
		});
	}

}

Message object bean

/**
 * Message bean for a voice broadcast: the text to synthesize and the company
 * whose connected sessions should receive the audio.
 *
 * The original listing had only the fields, but VoicePool.broadcast and the
 * usage example call getters/setters — they are required for compilation.
 */
public class VoiceMsgVo {
	// Text to be synthesized by the TTS service.
	private String text;
	// Target company; its registered sessions receive the audio.
	private Long companyId;

	public String getText() {
		return text;
	}

	public void setText(String text) {
		this.text = text;
	}

	public Long getCompanyId() {
		return companyId;
	}

	public void setCompanyId(Long companyId) {
		this.companyId = companyId;
	}
}

4. Websocket configuration

/**
 * Spring WebSocket configuration: exposes the voice handler at /ws/voice
 * and allows cross-origin connections.
 */
@Configuration
@EnableWebSocket
public class WebSocketConfig implements WebSocketConfigurer {

	/** The handler bean managed by Spring. */
	@Bean
	public VoiceHandler voiceHandler() {
		return new VoiceHandler();
	}

	/** Maps the handler to its endpoint and opens it to any origin. */
	@Override
	public void registerWebSocketHandlers(WebSocketHandlerRegistry registry) {
		registry
				.addHandler(voiceHandler(), "/ws/voice")
				.setAllowedOrigins("*");
	}

}

5. Front end js processing

Create any page, and introduce the following js

// One shared AudioContext for all playback (webkit prefix covers older browsers).
var audioContext = new (window.AudioContext || window.webkitAudioContext)();
// Namespace object holding the WebSocket connection state.
var Chat = {};
Chat.socket = null;
// Opens the WebSocket to `host`, wires up the heartbeat, and plays every
// binary frame it receives through the Web Audio API.
Chat.connect = (function(host) {
	// Pick whichever WebSocket implementation this browser provides.
	if ("WebSocket" in window) {
		Chat.socket = new WebSocket(host);
	} else if ("MozWebSocket" in window) {
		Chat.socket = new MozWebSocket(host);
	} else {
		Console.log("Error: WebSocket is not supported by this browser.");
		return;
	}

	Chat.socket.onopen = function() {
		Console.log("Info: Voice broadcast started.");
		// Arm the heartbeat timer as soon as the link is up.
		heartCheck.reset().start(Chat.socket);
	};

	Chat.socket.onclose = function() {
		Console.log("Info: Voice broadcast is off.");
	};

	Chat.socket.onmessage = function(message) {
		// Any inbound traffic proves the link is alive; re-arm the heartbeat.
		heartCheck.reset().start(Chat.socket);

		var data = message.data;
		if (data == null || data == '' || "HeartBeat" == data) {
			// Heartbeat (or empty) frame — nothing to play.
			return;
		}

		// Binary frame: read the audio blob, decode to PCM, and play it.
		var reader = new FileReader();
		reader.onload = function(evt) {
			if (evt.target.readyState != FileReader.DONE) {
				return;
			}
			audioContext.decodeAudioData(evt.target.result, function(buffer) {
				var sourceNode = audioContext.createBufferSource();
				sourceNode.buffer = buffer;
				sourceNode.connect(audioContext.destination);
				sourceNode.start(0);
			}, function(e) {
				console.log(e);
			});
		};
		reader.readAsArrayBuffer(data);
	};
});
// Connects to /ws/voice, matching ws/wss to the page's http/https scheme.
Chat.initialize = function() {
	Chat.companyId = _currCompanyId;
	var scheme = (window.location.protocol == "http:") ? "ws://" : "wss://";
	Chat.connect(scheme + window.location.host + "/ws/voice?companyId=" + Chat.companyId);
};
// Sends the contents of the #chat input over the socket, then clears it.
Chat.sendMessage = (function() {
	var input = document.getElementById("chat");
	var message = input.value;
	if (message != "") {
		Chat.socket.send(message);
		input.value = "";
	}
});
// On-page logger: appends messages to the #console element, falling back to
// the browser console when the element is missing.
var Console = {};
Console.log = (function(message) {
	var panel = document.getElementById("console");
	if (panel == null || panel == undefined) {
		console.log(message);
		return;
	}
	var entry = document.createElement("p");
	entry.style.wordWrap = "break-word";
	entry.innerHTML = message;
	panel.appendChild(entry);
	// Keep at most 25 entries, discarding the oldest first.
	while (panel.childNodes.length > 25) {
		panel.removeChild(panel.firstChild);
	}
	// Pin the view to the newest entry.
	panel.scrollTop = panel.scrollHeight;
});
// Open the voice websocket as soon as this script loads.
Chat.initialize();


// Heartbeat detection: after `timeout` ms of silence, ping the server; if no
// reply arrives within another `timeout` ms, close the socket. onmessage
// calls reset().start() on every frame, so the timers only fire when the
// connection has genuinely gone quiet.
var heartCheck = {
	timeout : 60000,// 60 seconds
	timeoutObj : null,
	serverTimeoutObj : null,
	// Cancel both pending timers; returns `this` so calls can be chained.
	reset : function() {
		clearTimeout(this.timeoutObj);
		clearTimeout(this.serverTimeoutObj);
		return this;
	},
	// Arm the idle timer on socket `ws`. When it fires we send a heartbeat
	// and wait one more timeout for any reply. If nothing comes back, close
	// the socket and let the onclose handler drive reconnection — closing
	// (rather than reconnecting here directly) avoids triggering onclose's
	// reconnect a second time.
	start : function(ws) {
		var self = this;
		this.timeoutObj = setTimeout(function() {
			ws.send("HeartBeat");
			self.serverTimeoutObj = setTimeout(function() {
				ws.close();
			}, self.timeout);
		}, this.timeout);
	}
};

IV. start engineering test

Start the project and send a message from the background

// Build a message for company 1 and broadcast the synthesized audio to all
// of that company's connected websocket sessions.
VoiceMsgVo vo = new VoiceMsgVo();
vo.setCompanyId(1L);
vo.setText("What a nice day today! I went out for a walk");
VoicePool.broadcast(vo);

Posted by ashbai on Sat, 02 Nov 2019 02:14:18 -0700