With the development of Flink Table & SQL, dimension table joins in Flink SQL have become the choice for many scenarios.
Building on earlier write-ups, this article again summarizes the ways to implement a dimension table join in Flink Table & SQL, as well as in the DataStream API:
- Periodically load dimension data
- Distributed cache
- Async I/O
- Broadcast State
- UDTF + LATERAL TABLE syntax
- LookupableTableSource
Periodically load dimension data
Implementation
- Implement a RichFlatMapFunction and, in the open() method, start a timer/thread that periodically reads the dimension data and loads it into memory.
- Perform the dimension association in the flatMap() method.
Code example
package com.bigdata.flink.dimJoin; import lombok.extern.slf4j.Slf4j; import org.apache.flink.api.common.functions.RichFlatMapFunction; import org.apache.flink.api.java.tuple.Tuple2; import org.apache.flink.configuration.Configuration; import org.apache.flink.util.Collector; import java.sql.*; import java.util.HashMap; import java.util.Timer; import java.util.TimerTask; /** * Regularly load dimension data into memory */ @Slf4j public class DimRichFlatMapFunction extends RichFlatMapFunction<UserBrowseLog, Tuple2<UserBrowseLog, UserInfo>> { private final String url; private final String user; private final String passwd; private final Integer reloadInterval; private Connection connection; private final String JDBC_DRIVER = "com.mysql.cj.jdbc.Driver"; HashMap dimInfo = new HashMap<String, UserInfo>(); public DimRichFlatMapFunction(String url, String user, String passwd, Integer reloadInterval) { this.url = url; this.user = user; this.passwd = passwd; this.reloadInterval = reloadInterval; } /** * open a connection * Regularly load dimension data * * @param parameters * @throws Exception */ @Override public void open(Configuration parameters) throws Exception { super.open(parameters); Class.forName(JDBC_DRIVER); TimerTask timerTask = new TimerTask() { @Override public void run() { try { if (connection == null || connection.isClosed()) { log.warn("No connection. Trying to reconnect..."); connection = DriverManager.getConnection(url, user, passwd); } String sql = "select uid,name,age,address from t_user_info"; PreparedStatement preparedStatement = connection.prepareStatement(sql); ResultSet resultSet = preparedStatement.executeQuery(); while (resultSet.next()) { UserInfo userInfo = new UserInfo(); userInfo.setUid(resultSet.getString("uid")); userInfo.setName(resultSet.getString("name")); userInfo.setAge(resultSet.getInt("age")); userInfo.setAddress(resultSet.getString("address")); dimInfo.put(userInfo.getUid(), userInfo); } } catch (SQLException e) { log.error("Get dimension data exception...", e); } } }; Timer timer = new Timer(); timer.scheduleAtFixedRate(timerTask, 0, reloadInterval * 1000); } /** * Close connection */ @Override public void close() throws Exception { super.close(); if (connection != null) { connection.close(); } } /** * Dimension association * @param value * @param out * @throws Exception */ @Override public void flatMap(UserBrowseLog value, Collector<Tuple2<UserBrowseLog, UserInfo>> out) throws Exception { String userID = value.getUserID(); if (dimInfo.containsKey(userID)) { UserInfo dim = (UserInfo) dimInfo.get(userID); out.collect(new Tuple2<>(value, dim)); } } }
Notes
- Because the data is held in memory, only small dimension tables are supported.
- The data is reloaded on a fixed interval, so this only suits dimension tables that change infrequently.
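As a variation that is not part of the example above, the periodic reload can also be driven by a ScheduledExecutorService instead of java.util.Timer. A minimal sketch, assuming the same t_user_info table and JDBC parameters as above (the class and field names here are illustrative):

// A minimal sketch: the same periodic-reload idea, driven by a
// ScheduledExecutorService instead of java.util.Timer.
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

public class DimReloader {

    private final Map<String, String> dimInfo = new ConcurrentHashMap<>();
    private final ScheduledExecutorService scheduler =
            Executors.newSingleThreadScheduledExecutor();

    /** Call from RichFlatMapFunction#open(). */
    public void start(String url, String user, String passwd, long reloadIntervalSeconds) {
        scheduler.scheduleAtFixedRate(() -> {
            try (Connection conn = DriverManager.getConnection(url, user, passwd);
                 PreparedStatement ps = conn.prepareStatement("select uid, name from t_user_info");
                 ResultSet rs = ps.executeQuery()) {
                while (rs.next()) {
                    // Refresh the in-memory snapshot entry by entry.
                    dimInfo.put(rs.getString("uid"), rs.getString("name"));
                }
            } catch (SQLException e) {
                // Keep the previous snapshot on failure and retry on the next tick.
                e.printStackTrace();
            }
        }, 0, reloadIntervalSeconds, TimeUnit.SECONDS);
    }

    /** Call from flatMap() to associate the dimension value. */
    public String lookup(String uid) {
        return dimInfo.get(uid);
    }

    /** Call from RichFlatMapFunction#close(). */
    public void stop() {
        scheduler.shutdownNow();
    }
}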
Distributed cache
Implementation
- Register a local or HDFS cache file via env.registerCachedFile(cachedFilePath, cachedFileName).
- When the program starts, Flink automatically distributes the file to the TaskManagers' file systems.
- Implement a RichFlatMapFunction and, in the open() method, obtain and parse the cached file through the RuntimeContext.
- The parsed data is then held in memory, and the dimension association can be done in the flatMap() method.
Code example
package com.bigdata.flink.dimJoin; import lombok.extern.slf4j.Slf4j; import org.apache.commons.io.FileUtils; import org.apache.flink.api.common.functions.RichFlatMapFunction; import org.apache.flink.api.java.tuple.Tuple2; import org.apache.flink.configuration.Configuration; import org.apache.flink.streaming.api.datastream.DataStreamSource; import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; import org.apache.flink.util.Collector; import java.io.File; import java.util.HashMap; import java.util.List; /** * Dimension association through Distributed Cache */ @Slf4j public class DistributedCacheJoinDim { public static void main(String[] args) throws Exception { StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); // Register cache files, such as: file:///some/path Or hdfs://host:port/and/path String cachedFilePath = "./user_info.txt"; String cachedFileName = "user_info"; env.registerCachedFile(cachedFilePath, cachedFileName); // Add live stream DataStreamSource<Tuple2<String, String>> stream = env.fromElements( Tuple2.of("1", "click"), Tuple2.of("2", "click"), Tuple2.of("3", "browse")); // Association dimension SingleOutputStreamOperator<String> dimedStream = stream.flatMap(new RichFlatMapFunction<Tuple2<String, String>, String>() { HashMap dimInfo = new HashMap<String, Integer>(); // read file @Override public void open(Configuration parameters) throws Exception { super.open(parameters); File cachedFile = getRuntimeContext().getDistributedCache().getFile(cachedFileName); List<String> lines = FileUtils.readLines(cachedFile); for (String line : lines) { String[] split = line.split(","); dimInfo.put(split[0], Integer.valueOf(split[1])); } } // Association dimension @Override public void flatMap(Tuple2<String, String> value, Collector<String> out) throws Exception { if (dimInfo.containsKey(value.f0)) { Integer age = (Integer) dimInfo.get(value.f0); out.collect(value.f0 + "," + value.f1 + "," + age); } } }); dimedStream.print(); env.execute(); } }
Notes
- Because the data is held in memory, only small dimension tables are supported.
- The file is loaded at startup; when the dimension table changes, the job has to be restarted.
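For reference, a condensed sketch of the registration and retrieval calls used in the example above; the paths and registered names are illustrative:

// Register a local file, or an HDFS file (the third argument marks it executable).
env.registerCachedFile("file:///data/dim/user_info.txt", "user_info");
env.registerCachedFile("hdfs://namenode:9000/dim/user_info.txt", "user_info_hdfs", false);

// Inside RichFlatMapFunction#open(), retrieve the file by its registered name:
File cachedFile = getRuntimeContext().getDistributedCache().getFile("user_info");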
Async I/O
Implementation
- Dimension data is kept in external storage such as Elasticsearch, Redis, or HBase.
- Query the dimension data through Flink's Async I/O.
- Combine it with a local cache such as Guava Cache to reduce accesses to the external storage.
Code example
package com.bigdata.flink; import com.alibaba.fastjson.JSON; import com.alibaba.fastjson.JSONObject; import org.apache.flink.api.common.functions.FilterFunction; import org.apache.flink.api.common.functions.MapFunction; import org.apache.flink.api.common.serialization.SimpleStringSchema; import org.apache.flink.api.common.typeinfo.TypeHint; import org.apache.flink.api.java.tuple.Tuple4; import org.apache.flink.api.java.tuple.Tuple5; import org.apache.flink.api.java.utils.ParameterTool; import org.apache.flink.configuration.*; import org.apache.flink.streaming.api.datastream.AsyncDataStream; import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer010; import java.util.Properties; import java.util.concurrent.TimeUnit; /** * Join flow table and dimension table with Async I/O */ public class FlinkAsyncIO { public static void main(String[] args) throws Exception{ /**Parsing command line parameters*/ ParameterTool parameterTool = ParameterTool.fromArgs(args); String kafkaBootstrapServers = parameterTool.get("kafka.bootstrap.servers"); String kafkaGroupID = parameterTool.get("kafka.group.id"); String kafkaAutoOffsetReset= parameterTool.get("kafka.auto.offset.reset"); String kafkaTopic = parameterTool.get("kafka.topic"); int kafkaParallelism =parameterTool.getInt("kafka.parallelism"); String esHost= parameterTool.get("es.host"); Integer esPort= parameterTool.getInt("es.port"); String esUser = parameterTool.get("es.user"); String esPassword = parameterTool.get("es.password"); String esIndex = parameterTool.get("es.index"); String esType = parameterTool.get("es.type"); /**Flink DataStream Operating environment*/ Configuration config = new Configuration(); config.setInteger(RestOptions.PORT,8081); config.setBoolean(ConfigConstants.LOCAL_START_WEBSERVER, true); StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(config); /**add data source*/ Properties kafkaProperties = new Properties(); kafkaProperties.put("bootstrap.servers",kafkaBootstrapServers); kafkaProperties.put("group.id",kafkaGroupID); kafkaProperties.put("auto.offset.reset",kafkaAutoOffsetReset); FlinkKafkaConsumer010<String> kafkaConsumer = new FlinkKafkaConsumer010<>(kafkaTopic, new SimpleStringSchema(), kafkaProperties); kafkaConsumer.setCommitOffsetsOnCheckpoints(true); SingleOutputStreamOperator<String> source = env.addSource(kafkaConsumer).name("KafkaSource").setParallelism(kafkaParallelism); //data conversion SingleOutputStreamOperator<Tuple4<String, String, String, Integer>> sourceMap = source.map((MapFunction<String, Tuple4<String, String, String, Integer>>) value -> { Tuple4<String, String, String, Integer> output = new Tuple4<>(); try { JSONObject obj = JSON.parseObject(value); output.f0 = obj.getString("userID"); output.f1 = obj.getString("eventTime"); output.f2 = obj.getString("eventType"); output.f3 = obj.getInteger("productID"); } catch (Exception e) { e.printStackTrace(); } return output; }).returns(new TypeHint<Tuple4<String, String, String, Integer>>(){}).name("Map: ExtractTransform"); //Filter out abnormal data SingleOutputStreamOperator<Tuple4<String, String, String, Integer>> sourceFilter = sourceMap.filter((FilterFunction<Tuple4<String, String, String, Integer>>) value -> value.f3 != null).name("Filter: FilterExceptionData"); //Timeout: timeout. 
When the asynchronous I/O request is timed out by default, an exception will be thrown and the job will be restarted or stopped. If you want to handle timeouts, you can override the AsyncFunction#timeout method. //Capacity: number of concurrent requests /**Async IO Join flow table and dimension table*/ SingleOutputStreamOperator<Tuple5<String, String, String, Integer, Integer>> result = AsyncDataStream.orderedWait(sourceFilter, new ElasticsearchAsyncFunction(esHost,esPort,esUser,esPassword,esIndex,esType), 500, TimeUnit.MILLISECONDS, 10).name("Join: JoinWithDim"); /**Result output*/ result.print().name("PrintToConsole"); env.execute(); } }
ElasticsearchAsyncFunction:
package com.bigdata.flink; import com.alibaba.fastjson.JSON; import com.alibaba.fastjson.JSONObject; import com.google.common.cache.Cache; import com.google.common.cache.CacheBuilder; import com.google.common.cache.RemovalListener; import com.google.common.cache.RemovalNotification; import org.apache.flink.api.java.tuple.Tuple4; import org.apache.flink.api.java.tuple.Tuple5; import org.apache.flink.configuration.Configuration; import org.apache.flink.streaming.api.functions.async.ResultFuture; import org.apache.flink.streaming.api.functions.async.RichAsyncFunction; import org.apache.http.HttpHost; import org.apache.http.auth.AuthScope; import org.apache.http.auth.UsernamePasswordCredentials; import org.apache.http.client.CredentialsProvider; import org.apache.http.impl.client.BasicCredentialsProvider; import org.elasticsearch.action.ActionListener; import org.elasticsearch.action.search.SearchRequest; import org.elasticsearch.action.search.SearchResponse; import org.elasticsearch.client.RestClient; import org.elasticsearch.client.RestHighLevelClient; import org.elasticsearch.index.query.QueryBuilders; import org.elasticsearch.search.SearchHit; import org.elasticsearch.search.builder.SearchSourceBuilder; import java.util.Collections; import java.util.concurrent.TimeUnit; /** * Customize ElasticsearchAsyncFunction to query dimension data from ES */ public class ElasticsearchAsyncFunction extends RichAsyncFunction<Tuple4<String, String, String, Integer>, Tuple5<String, String, String, Integer,Integer>> { private String host; private Integer port; private String user; private String password; private String index; private String type; public ElasticsearchAsyncFunction(String host, Integer port, String user, String password, String index, String type) { this.host = host; this.port = port; this.user = user; this.password = password; this.index = index; this.type = type; } private RestHighLevelClient restHighLevelClient; private Cache<String,Integer> cache; /** * Establish connection with ES * @param parameters */ @Override public void open(Configuration parameters){ //ES Client CredentialsProvider credentialsProvider = new BasicCredentialsProvider(); credentialsProvider.setCredentials(AuthScope.ANY, new UsernamePasswordCredentials(user, password)); restHighLevelClient = new RestHighLevelClient( RestClient .builder(new HttpHost(host, port)) .setHttpClientConfigCallback(httpAsyncClientBuilder -> httpAsyncClientBuilder.setDefaultCredentialsProvider(credentialsProvider))); //Initialize cache cache=CacheBuilder.newBuilder().maximumSize(2).expireAfterAccess(5, TimeUnit.MINUTES).build(); } /** * Close connection * @throws Exception */ @Override public void close() throws Exception { restHighLevelClient.close(); } /** * Asynchronous call * @param input * @param resultFuture */ @Override public void asyncInvoke(Tuple4<String, String, String, Integer> input, ResultFuture<Tuple5<String, String, String, Integer, Integer>> resultFuture) { // 1. Fetch from cache first Integer cachedValue = cache.getIfPresent(input.f0); if(cachedValue !=null){ System.out.println("Get dimension data from cache: key="+input.f0+",value="+cachedValue); resultFuture.complete(Collections.singleton(new Tuple5<>(input.f0,input.f1,input.f2,input.f3,cachedValue))); // 2. 
If there is no in the cache, it is obtained from external storage }else { searchFromES(input,resultFuture); } } /** * When there is no data in the cache, it is obtained from the external storage ES * @param input * @param resultFuture */ private void searchFromES(Tuple4<String, String, String, Integer> input, ResultFuture<Tuple5<String, String, String, Integer, Integer>> resultFuture){ // 1. Construct output object Tuple5<String, String, String, Integer, Integer> output = new Tuple5<>(); output.f0=input.f0; output.f1=input.f1; output.f2=input.f2; output.f3=input.f3; // 2. Key to be queried String dimKey = input.f0; // 3. Construct Ids Query SearchRequest searchRequest = new SearchRequest(); searchRequest.indices(index); searchRequest.types(type); searchRequest.source(SearchSourceBuilder.searchSource().query(QueryBuilders.idsQuery().addIds(dimKey))); // 4. Query data with asynchronous client restHighLevelClient.searchAsync(searchRequest, new ActionListener<SearchResponse>() { //Process on successful response @Override public void onResponse(SearchResponse searchResponse) { SearchHit[] searchHits = searchResponse.getHits().getHits(); if(searchHits.length >0 ){ JSONObject obj = JSON.parseObject(searchHits[0].getSourceAsString()); Integer dimValue=obj.getInteger("age"); output.f4=dimValue; cache.put(dimKey,dimValue); System.out.println("Put dimension data into cache: key="+dimKey+",value="+dimValue); } resultFuture.complete(Collections.singleton(output)); } //Process when response fails @Override public void onFailure(Exception e) { output.f4=null; resultFuture.complete(Collections.singleton(output)); } }); } //Timeout processing @Override public void timeout(Tuple4<String, String, String, Integer> input, ResultFuture<Tuple5<String, String, String, Integer, Integer>> resultFuture) { searchFromES(input,resultFuture); } }
Notes
- This approach is not limited by memory, so it can support large dimension tables.
- It requires external storage.
- Accesses to the external storage should be kept to a minimum.
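One design choice worth calling out: whether the output order must match the input order. The example above uses orderedWait; below is a sketch of both variants with the same arguments (timeout 500 ms, capacity 10) and the same ElasticsearchAsyncFunction:

ElasticsearchAsyncFunction asyncFunction =
        new ElasticsearchAsyncFunction(esHost, esPort, esUser, esPassword, esIndex, esType);

// orderedWait preserves the input order: results are buffered until earlier
// requests complete, which can add latency under load.
SingleOutputStreamOperator<Tuple5<String, String, String, Integer, Integer>> ordered =
        AsyncDataStream.orderedWait(sourceFilter, asyncFunction, 500, TimeUnit.MILLISECONDS, 10);

// unorderedWait emits each result as soon as it arrives; usually lower latency
// when the downstream logic does not depend on order.
SingleOutputStreamOperator<Tuple5<String, String, String, Integer, Integer>> unordered =
        AsyncDataStream.unorderedWait(sourceFilter, asyncFunction, 500, TimeUnit.MILLISECONDS, 10);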
Broadcast State
Implementation
- Send the dimension data to Kafka as a stream S1; the fact data forms stream S2.
- Define a state descriptor MapStateDescriptor, e.g. descriptor.
- Broadcast S1 with the state descriptor, e.g. S1.broadcast(descriptor), forming a broadcast stream B1.
- Connect the fact stream S2 with the broadcast stream B1, forming a BroadcastConnectedStream BC.
- On the connected stream BC, implement the join logic in a KeyedBroadcastProcessFunction / BroadcastProcessFunction.
Code example
package com.bigdata.flink; import com.alibaba.fastjson.JSON; import com.alibaba.fastjson.JSONObject; import lombok.extern.slf4j.Slf4j; import org.apache.flink.api.common.serialization.SimpleStringSchema; import org.apache.flink.api.common.state.BroadcastState; import org.apache.flink.api.common.state.MapStateDescriptor; import org.apache.flink.api.common.state.ReadOnlyBroadcastState; import org.apache.flink.api.common.typeinfo.Types; import org.apache.flink.api.java.tuple.*; import org.apache.flink.api.java.utils.ParameterTool; import org.apache.flink.configuration.Configuration; import org.apache.flink.runtime.state.StateBackend; import org.apache.flink.runtime.state.filesystem.FsStateBackend; import org.apache.flink.shaded.guava18.com.google.common.collect.Maps; import org.apache.flink.streaming.api.CheckpointingMode; import org.apache.flink.streaming.api.datastream.*; import org.apache.flink.streaming.api.environment.CheckpointConfig; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; import org.apache.flink.streaming.api.functions.ProcessFunction; import org.apache.flink.streaming.api.functions.co.BroadcastProcessFunction; import org.apache.flink.streaming.api.functions.source.RichSourceFunction; import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer010; import org.apache.flink.util.Collector; import java.sql.Connection; import java.sql.DriverManager; import java.sql.PreparedStatement; import java.sql.ResultSet; import java.util.HashMap; import java.util.Map; import java.util.Properties; /** * Dynamically update the configuration based on Broadcast State to filter data in real time and add fields */ @Slf4j public class TestBroadcastState { public static void main(String[] args) throws Exception{ //1. Parsing command line parameters ParameterTool fromArgs = ParameterTool.fromArgs(args); ParameterTool parameterTool = ParameterTool.fromPropertiesFile(fromArgs.getRequired("applicationProperties")); //checkpoint configuration String checkpointDirectory = parameterTool.getRequired("checkpointDirectory"); long checkpointSecondInterval = parameterTool.getLong("checkpointSecondInterval"); //Event flow configuration String fromKafkaBootstrapServers = parameterTool.getRequired("fromKafka.bootstrap.servers"); String fromKafkaGroupID = parameterTool.getRequired("fromKafka.group.id"); String fromKafkaTopic = parameterTool.getRequired("fromKafka.topic"); //Configure flow configuration String fromMysqlHost = parameterTool.getRequired("fromMysql.host"); int fromMysqlPort = parameterTool.getInt("fromMysql.port"); String fromMysqlDB = parameterTool.getRequired("fromMysql.db"); String fromMysqlUser = parameterTool.getRequired("fromMysql.user"); String fromMysqlPasswd = parameterTool.getRequired("fromMysql.passwd"); int fromMysqlSecondInterval = parameterTool.getInt("fromMysql.secondInterval"); //2. Configure running environment StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); //Set StateBackend env.setStateBackend((StateBackend) new FsStateBackend(checkpointDirectory, true)); //Set Checkpoint CheckpointConfig checkpointConfig = env.getCheckpointConfig(); checkpointConfig.setCheckpointInterval(checkpointSecondInterval * 1000); checkpointConfig.setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE); checkpointConfig.enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION); //3. 
Kafka event flow //Get event data from Kafka //Data: a user browses or clicks a product at a certain time, such as //{"userID": "user_3", "eventTime": "2019-08-17 12:19:47", "eventType": "browse", "productID": 1} Properties kafkaProperties = new Properties(); kafkaProperties.put("bootstrap.servers",fromKafkaBootstrapServers); kafkaProperties.put("group.id",fromKafkaGroupID); FlinkKafkaConsumer010<String> kafkaConsumer = new FlinkKafkaConsumer010<>(fromKafkaTopic, new SimpleStringSchema(), kafkaProperties); kafkaConsumer.setStartFromLatest(); DataStream<String> kafkaSource = env.addSource(kafkaConsumer).name("KafkaSource").uid("source-id-kafka-source"); SingleOutputStreamOperator<Tuple4<String, String, String, Integer>> eventStream = kafkaSource.process(new ProcessFunction<String, Tuple4<String, String, String, Integer>>() { @Override public void processElement(String value, Context ctx, Collector<Tuple4<String, String, String, Integer>> out){ try { JSONObject obj = JSON.parseObject(value); String userID = obj.getString("userID"); String eventTime = obj.getString("eventTime"); String eventType = obj.getString("eventType"); int productID = obj.getIntValue("productID"); out.collect(new Tuple4<>(userID, eventTime, eventType, productID)); }catch (Exception ex){ log.warn("Abnormal data:{}",value,ex); } } }); //4. Mysql configuration flow //Customize the Mysql Source, periodically obtain the configuration from Mysql and broadcast it //Data: user ID, user name, user age DataStreamSource<HashMap<String, Tuple2<String, Integer>>> configStream = env.addSource(new MysqlSource(fromMysqlHost, fromMysqlPort, fromMysqlDB, fromMysqlUser, fromMysqlPasswd, fromMysqlSecondInterval)); /* (1) Create MapStateDescriptor first MapStateDescriptor Defines the name, Key, and Value type of the state. Here, in MapStateDescriptor, key is the Void type, and value is the map < string, tuple2 < string, int > > type. */ MapStateDescriptor<Void, Map<String, Tuple2<String,Integer>>> configDescriptor = new MapStateDescriptor<>("config", Types.VOID, Types.MAP(Types.STRING, Types.TUPLE(Types.STRING, Types.INT))); /* (2) Stream broadcasting will be configured to form a BroadcastStream */ BroadcastStream<HashMap<String, Tuple2<String, Integer>>> broadcastConfigStream = configStream.broadcast(configDescriptor); //5. The event stream is connected with the broadcast configuration stream to form a BroadcastConnectedStream BroadcastConnectedStream<Tuple4<String, String, String, Integer>, HashMap<String, Tuple2<String, Integer>>> connectedStream = eventStream.connect(broadcastConfigStream); //6. Apply the process method to BroadcastConnectedStream to process events according to the configuration (rules) SingleOutputStreamOperator<Tuple6<String, String, String, Integer, String, Integer>> resultStream = connectedStream.process(new CustomBroadcastProcessFunction()); //7. Output results resultStream.print(); //8. 
Generate the JobGraph and start execution env.execute(); } /** * Custom BroadcastProcessFunction * When the user ID in the event flow appears in the configuration, the event is processed, and the basic information of the user is supplemented in the event * Tuple4<String, String, String, Integer>: The data type of the first stream (event stream) * HashMap<String, Tuple2<String, Integer>>: Data type of the second flow (configuration flow) * Tuple6<String, String, String, Integer,String, Integer>: Data type returned */ static class CustomBroadcastProcessFunction extends BroadcastProcessFunction<Tuple4<String, String, String, Integer>, HashMap<String, Tuple2<String, Integer>>, Tuple6<String, String, String, Integer, String, Integer>>{ /**Define MapStateDescriptor*/ MapStateDescriptor<Void, Map<String, Tuple2<String,Integer>>> configDescriptor = new MapStateDescriptor<>("config", Types.VOID, Types.MAP(Types.STRING, Types.TUPLE(Types.STRING, Types.INT))); /** * Read the state and process the data in the event stream based on the state * Here, the state is obtained from the context, and the data in the event flow is processed based on the obtained state * @param value Data in event flow * @param ctx context * @param out Output zero or more data * @throws Exception */ @Override public void processElement(Tuple4<String, String, String, Integer> value, ReadOnlyContext ctx, Collector<Tuple6<String, String, String, Integer, String, Integer>> out) throws Exception { //User ID in event flow String userID = value.f0; //Get status ReadOnlyBroadcastState<Void, Map<String, Tuple2<String, Integer>>> broadcastState = ctx.getBroadcastState(configDescriptor); Map<String, Tuple2<String, Integer>> broadcastStateUserInfo = broadcastState.get(null); //If there is this user in the configuration, add the user's userName and userAge fields to the event. 
//If there is no such user in the configuration, it will be discarded Tuple2<String, Integer> userInfo = broadcastStateUserInfo.get(userID); if(userInfo!=null){ out.collect(new Tuple6<>(value.f0,value.f1,value.f2,value.f3,userInfo.f0,userInfo.f1)); } } /** * Process each piece of data in the broadcast stream and update the status * @param value Data in broadcast stream * @param ctx context * @param out Output zero or more data * @throws Exception */ @Override public void processBroadcastElement(HashMap<String, Tuple2<String, Integer>> value, Context ctx, Collector<Tuple6<String, String, String, Integer, String, Integer>> out) throws Exception { //Get status BroadcastState<Void, Map<String, Tuple2<String, Integer>>> broadcastState = ctx.getBroadcastState(configDescriptor); //Empty status broadcastState.clear(); //Update status broadcastState.put(null,value); } } /** * Customize the Mysql Source and get the configuration from Mysql every secondInterval seconds */ static class MysqlSource extends RichSourceFunction<HashMap<String, Tuple2<String, Integer>>> { private String host; private Integer port; private String db; private String user; private String passwd; private Integer secondInterval; private volatile boolean isRunning = true; private Connection connection; private PreparedStatement preparedStatement; MysqlSource(String host, Integer port, String db, String user, String passwd,Integer secondInterval) { this.host = host; this.port = port; this.db = db; this.user = user; this.passwd = passwd; this.secondInterval = secondInterval; } /** * At the beginning, the connection is established in the open() method * @param parameters * @throws Exception */ @Override public void open(Configuration parameters) throws Exception { super.open(parameters); Class.forName("com.mysql.jdbc.Driver"); connection= DriverManager.getConnection("jdbc:mysql://"+host+":"+port+"/"+db+"?useUnicode=true&characterEncoding=UTF-8", user, passwd); String sql="select userID,userName,userAge from user_info"; preparedStatement=connection.prepareStatement(sql); } /** * After execution, call the close() method to connect the relationship and release resources * @throws Exception */ @Override public void close() throws Exception { super.close(); if(connection!=null){ connection.close(); } if(preparedStatement !=null){ preparedStatement.close(); } } /** * Call the run() method to get the data * @param ctx */ @Override public void run(SourceContext<HashMap<String, Tuple2<String, Integer>>> ctx) { try { while (isRunning){ HashMap<String, Tuple2<String, Integer>> output = new HashMap<>(); ResultSet resultSet = preparedStatement.executeQuery(); while (resultSet.next()){ String userID = resultSet.getString("userID"); String userName = resultSet.getString("userName"); int userAge = resultSet.getInt("userAge"); output.put(userID,new Tuple2<>(userName,userAge)); } ctx.collect(output); //How many seconds does the query execute Thread.sleep(1000*secondInterval); } }catch (Exception ex){ log.error("from Mysql Get configuration exception...",ex); } } /** * This method is called when canceling */ @Override public void cancel() { isRunning = false; } } }
Notes
- Changes to the dimension data must be turned into a stream, e.g. via Kafka.
- Changes to the dimension data are picked up in near real time.
- The dimension data is held in broadcast state, i.e. in memory, so the supported data volume is relatively small.
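For quick reference, the broadcast wiring in the example above reduces to the following calls (types copied from that example):

// (1) State descriptor: Void key, Map<userID, (userName, userAge)> value.
MapStateDescriptor<Void, Map<String, Tuple2<String, Integer>>> configDescriptor =
        new MapStateDescriptor<>("config", Types.VOID,
                Types.MAP(Types.STRING, Types.TUPLE(Types.STRING, Types.INT)));

// (2) Broadcast the dimension/config stream S1.
BroadcastStream<HashMap<String, Tuple2<String, Integer>>> broadcastConfigStream =
        configStream.broadcast(configDescriptor);

// (3) Connect the fact stream S2 with the broadcast stream B1.
BroadcastConnectedStream<Tuple4<String, String, String, Integer>, HashMap<String, Tuple2<String, Integer>>> connectedStream =
        eventStream.connect(broadcastConfigStream);

// (4) Implement the join in a BroadcastProcessFunction.
SingleOutputStreamOperator<Tuple6<String, String, String, Integer, String, Integer>> resultStream =
        connectedStream.process(new CustomBroadcastProcessFunction());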
UDTF + LATERAL TABLE syntax
Implementation
- Assuming Flink SQL is used: write a custom UDTF by extending the TableFunction abstract class and implementing its open(), close(), and eval() methods.
- Register the TableFunction.
- Use the LATERAL TABLE syntax in SQL to join the stream against the rows produced by the UDTF (the join shape is sketched below; full examples follow).
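The join shape, with names taken from the Redis example further below (UDTFRedis, userName, and userAge are specific to that example):

String execSQL = ""
        + "SELECT source_kafka.*, dim.* "
        + "FROM source_kafka "
        + "LEFT JOIN LATERAL TABLE(UDTFRedis(userID)) AS dim (userName, userAge) ON TRUE";
Table table = tableEnv.sqlQuery(execSQL);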
Code example - Flink Table API
package com.bigdata.flink.tableSqlTemporalTable; import com.alibaba.fastjson.JSON; import com.bigdata.flink.beans.table.ProductInfo; import com.bigdata.flink.beans.table.UserBrowseLog; import lombok.extern.slf4j.Slf4j; import org.apache.flink.api.common.serialization.SimpleStringSchema; import org.apache.flink.api.java.utils.ParameterTool; import org.apache.flink.configuration.Configuration; import org.apache.flink.streaming.api.TimeCharacteristic; import org.apache.flink.streaming.api.datastream.DataStream; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; import org.apache.flink.streaming.api.functions.ProcessFunction; import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor; import org.apache.flink.streaming.api.windowing.time.Time; import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer010; import org.apache.flink.table.api.EnvironmentSettings; import org.apache.flink.table.api.Table; import org.apache.flink.table.api.java.StreamTableEnvironment; import org.apache.flink.table.functions.TemporalTableFunction; import org.apache.flink.types.Row; import org.apache.flink.util.Collector; import java.time.LocalDateTime; import java.time.OffsetDateTime; import java.time.ZoneOffset; import java.time.format.DateTimeFormatter; import java.util.Properties; /** * Summary: * Temporal table */ @Slf4j public class Test { public static void main(String[] args) throws Exception{ args=new String[]{"--application","flink/src/main/java/com/bigdata/flink/tableSqlTemporalTable/application.properties"}; //1. Parsing command line parameters ParameterTool fromArgs = ParameterTool.fromArgs(args); ParameterTool parameterTool = ParameterTool.fromPropertiesFile(fromArgs.getRequired("application")); //browse log String kafkaBootstrapServers = parameterTool.getRequired("kafkaBootstrapServers"); String browseTopic = parameterTool.getRequired("browseTopic"); String browseTopicGroupID = parameterTool.getRequired("browseTopicGroupID"); //product history info String productInfoTopic = parameterTool.getRequired("productHistoryInfoTopic"); String productInfoGroupID = parameterTool.getRequired("productHistoryInfoGroupID"); //2. Set up the running environment StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(new Configuration()); streamEnv.setStreamTimeCharacteristic(TimeCharacteristic.EventTime); EnvironmentSettings settings = EnvironmentSettings.newInstance().inStreamingMode().useBlinkPlanner().build(); StreamTableEnvironment tableEnv = StreamTableEnvironment.create(streamEnv, settings); streamEnv.setParallelism(1); //3. Register Kafka data source //Note: in order to have an intuitive understanding between Beijing time and timestamp, a field eventtimestamp is added to the UserBrowseLog as the timestamp of eventTime Properties browseProperties = new Properties(); browseProperties.put("bootstrap.servers",kafkaBootstrapServers); browseProperties.put("group.id",browseTopicGroupID); DataStream<UserBrowseLog> browseStream=streamEnv .addSource(new FlinkKafkaConsumer010<>(browseTopic, new SimpleStringSchema(), browseProperties)) .process(new BrowseKafkaProcessFunction()) .assignTimestampsAndWatermarks(new BrowseTimestampExtractor(Time.seconds(0))); tableEnv.registerDataStream("browse",browseStream,"userID,eventTime,eventTimeTimestamp,eventType,productID,productPrice,browseRowtime.rowtime"); //tableEnv.toAppendStream(tableEnv.scan("browse"),Row.class).print(); //4. 
Register temporal table //Note: in order to have an intuitive understanding between Beijing time and timestamp, a field updatedAtTimestamp is added to ProductInfo as the timestamp of updatedAt Properties productInfoProperties = new Properties(); productInfoProperties.put("bootstrap.servers",kafkaBootstrapServers); productInfoProperties.put("group.id",productInfoGroupID); DataStream<ProductInfo> productInfoStream=streamEnv .addSource(new FlinkKafkaConsumer010<>(productInfoTopic, new SimpleStringSchema(), productInfoProperties)) .process(new ProductInfoProcessFunction()) .assignTimestampsAndWatermarks(new ProductInfoTimestampExtractor(Time.seconds(0))); tableEnv.registerDataStream("productInfo",productInfoStream, "productID,productName,productCategory,updatedAt,updatedAtTimestamp,productInfoRowtime.rowtime"); //Set the time attribute and primary key of the temporary table TemporalTableFunction productInfo = tableEnv.scan("productInfo").createTemporalTableFunction("productInfoRowtime", "productID"); //Register TableFunction tableEnv.registerFunction("productInfoFunc",productInfo); //tableEnv.toAppendStream(tableEnv.scan("productInfo"),Row.class).print(); //5. Run SQL String sql = "" + "SELECT " + "browse.userID, " + "browse.eventTime, " + "browse.eventTimeTimestamp, " + "browse.eventType, " + "browse.productID, " + "browse.productPrice, " + "productInfo.productID, " + "productInfo.productName, " + "productInfo.productCategory, " + "productInfo.updatedAt, " + "productInfo.updatedAtTimestamp " + "FROM " + " browse, " + " LATERAL TABLE (productInfoFunc(browse.browseRowtime)) as productInfo " + "WHERE " + " browse.productID=productInfo.productID"; Table table = tableEnv.sqlQuery(sql); tableEnv.toAppendStream(table,Row.class).print(); //6. Start execution tableEnv.execute(Test.class.getSimpleName()); } /** * Parsing Kafka data */ static class BrowseKafkaProcessFunction extends ProcessFunction<String, UserBrowseLog> { @Override public void processElement(String value, Context ctx, Collector<UserBrowseLog> out) throws Exception { try { UserBrowseLog log = JSON.parseObject(value, UserBrowseLog.class); // Add a long timestamp // Specify Beijing time in yyyy MM DD HH: mm: SS format for eventTime DateTimeFormatter format = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); OffsetDateTime eventTime = LocalDateTime.parse(log.getEventTime(), format).atOffset(ZoneOffset.of("+08:00")); // Convert to millisecond timestamp long eventTimeTimestamp = eventTime.toInstant().toEpochMilli(); log.setEventTimeTimestamp(eventTimeTimestamp); out.collect(log); }catch (Exception ex){ log.error("analysis Kafka Data exception...",ex); } } } /** * Extracting timestamp to generate watermark */ static class BrowseTimestampExtractor extends BoundedOutOfOrdernessTimestampExtractor<UserBrowseLog> { BrowseTimestampExtractor(Time maxOutOfOrderness) { super(maxOutOfOrderness); } @Override public long extractTimestamp(UserBrowseLog element) { return element.getEventTimeTimestamp(); } } /** * Parsing Kafka data */ static class ProductInfoProcessFunction extends ProcessFunction<String, ProductInfo> { @Override public void processElement(String value, Context ctx, Collector<ProductInfo> out) throws Exception { try { ProductInfo log = JSON.parseObject(value, ProductInfo.class); // Add a long timestamp // Specify Beijing time in yyyy MM DD HH: mm: SS format for eventTime DateTimeFormatter format = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); OffsetDateTime eventTime = LocalDateTime.parse(log.getUpdatedAt(), 
format).atOffset(ZoneOffset.of("+08:00")); // Convert to millisecond timestamp long eventTimeTimestamp = eventTime.toInstant().toEpochMilli(); log.setUpdatedAtTimestamp(eventTimeTimestamp); out.collect(log); }catch (Exception ex){ log.error("analysis Kafka Data exception...",ex); } } } /** * Extracting timestamp to generate watermark */ static class ProductInfoTimestampExtractor extends BoundedOutOfOrdernessTimestampExtractor<ProductInfo> { ProductInfoTimestampExtractor(Time maxOutOfOrderness) { super(maxOutOfOrderness); } @Override public long extractTimestamp(ProductInfo element) { return element.getUpdatedAtTimestamp(); } } }
Code example - Flink SQL
Define UDTF:
package com.bigdata.flink.dimJoin; import org.apache.flink.api.common.typeinfo.TypeInformation; import org.apache.flink.api.common.typeinfo.Types; import org.apache.flink.api.java.typeutils.RowTypeInfo; import org.apache.flink.table.functions.FunctionContext; import org.apache.flink.table.functions.TableFunction; import org.apache.flink.types.Row; import redis.clients.jedis.Jedis; /** * UDTF */ public class UDTFRedis extends TableFunction<Row> { private Jedis jedis; /** * open a connection * @param context * @throws Exception */ @Override public void open(FunctionContext context) throws Exception { jedis = new Jedis("localhost", 6379); jedis.select(0); } /** * Close connection * @throws Exception */ @Override public void close() throws Exception { if (jedis != null) { jedis.close(); } } /** * Find dimension data from Redis * @param key */ public void eval(String key) { String value = jedis.get(key); if (value != null) { String[] valueSplit = value.split(","); Row row = new Row(2); row.setField(0, valueSplit[0]); row.setField(1, Integer.valueOf(valueSplit[1])); collector.collect(row); } } /** * Define the returned data type. The returned data are username and userage, so here it is string and int. * @return */ @Override public TypeInformation<Row> getResultType() { return new RowTypeInfo(Types.STRING, Types.INT); } }
Kafka Join Redis-Dim
package com.bigdata.flink.dimJoin; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; import org.apache.flink.table.api.EnvironmentSettings; import org.apache.flink.table.api.Table; import org.apache.flink.table.api.java.StreamTableEnvironment; import org.apache.flink.types.Row; /** * Author: Wang Pei * Summary: * Kafka Join Redis-Dim */ public class KafkaJoinRedisDimWithUDTF { public static void main(String[] args) throws Exception { EnvironmentSettings settings = EnvironmentSettings.newInstance().inStreamingMode().useBlinkPlanner().build(); StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment(); StreamTableEnvironment tableEnv = StreamTableEnvironment.create(streamEnv, settings); // Source DDL // Kafka data: {"userID":"user_1","eventType":"click","eventTime":"2015-01-01 00:00:00"} String sourceDDL = "" + "create table source_kafka " + "( " + " userID String, " + " eventType String, " + " eventTime String " + ") with ( " + " 'connector.type' = 'kafka', " + " 'connector.version' = '0.10', " + " 'connector.properties.bootstrap.servers' = 'kafka01:9092', " + " 'connector.properties.zookeeper.connect' = 'kafka01:2181', " + " 'connector.topic' = 'test_1', " + " 'connector.properties.group.id' = 'c1_test_1', " + " 'connector.startup-mode' = 'latest-offset', " + " 'format.type' = 'json' " + ")"; tableEnv.sqlUpdate(sourceDDL); tableEnv.toAppendStream(tableEnv.from("source_kafka"), Row.class).print(); // UDTF DDL // Data in Redis userid, username, userage // 127.0.0.1:6379> get user_1 // "name1,10" String udtfDDL = "" + "CREATE TEMPORARY FUNCTION " + " IF NOT EXISTS UDTFRedis " + " AS 'com.bigdata.flink.dimJoin.UDTFRedis'"; tableEnv.sqlUpdate(udtfDDL); // Query // Left Join String execSQL = "" + "select " + " source_kafka.*,dim.* " + "from source_kafka " + "LEFT JOIN LATERAL TABLE(UDTFRedis(userID)) as dim (userName,userAge) ON TRUE"; Table table = tableEnv.sqlQuery(execSQL); tableEnv.toAppendStream(table, Row.class).print(); tableEnv.execute(KafkaJoinRedisDimWithUDTF.class.getSimpleName()); } }
Notes
- A UDTF has to be written and the LATERAL TABLE syntax used.
- It is not very general: it is hard to write a single UDTF that covers every way of fetching dimension data from Redis.
- It relies on external storage, so changes to the dimension data can be picked up in time.
LookupableTableSource
Implementation
The data source implements the LookupableTableSource interface.
In Flink SQL the lookup table can then be used directly; in the Flink Table API the LookupFunction has to be registered.
In essence, the dimension data is still obtained through a TableFunction; the SQL shape of such a lookup join is sketched below.
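The SQL side of such a lookup join uses FOR SYSTEM_TIME AS OF on a processing-time attribute; the shape below is taken from the Flink SQL example further down (table and field names are specific to that example):

String execSQL = ""
        + "SELECT kafka.*, mysql.userName, mysql.userAge "
        + "FROM source_kafka AS kafka "
        + "LEFT JOIN dim_mysql FOR SYSTEM_TIME AS OF kafka.proctime AS mysql "
        + "ON kafka.userID = mysql.userID";
Table table = tableEnv.sqlQuery(execSQL);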
Code example - Flink Table API
package com.bigdata.flink.tableSqlLookableTableSource; import com.alibaba.fastjson.JSON; import com.bigdata.flink.beans.table.UserBrowseLog; import lombok.extern.slf4j.Slf4j; import org.apache.flink.addons.hbase.HBaseTableSource; import org.apache.flink.api.common.serialization.SimpleStringSchema; import org.apache.flink.api.java.io.jdbc.JDBCLookupOptions; import org.apache.flink.api.java.io.jdbc.JDBCOptions; import org.apache.flink.api.java.io.jdbc.JDBCTableSource; import org.apache.flink.api.java.utils.ParameterTool; import org.apache.flink.streaming.api.datastream.DataStream; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; import org.apache.flink.streaming.api.functions.ProcessFunction; import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer010; import org.apache.flink.table.api.DataTypes; import org.apache.flink.table.api.EnvironmentSettings; import org.apache.flink.table.api.TableSchema; import org.apache.flink.table.api.java.StreamTableEnvironment; import org.apache.flink.table.types.DataType; import org.apache.flink.types.Row; import org.apache.flink.util.Collector; import org.apache.hadoop.conf.Configuration; import java.time.LocalDateTime; import java.time.OffsetDateTime; import java.time.ZoneOffset; import java.time.format.DateTimeFormatter; import java.util.Properties; /** * Lookup Table Source */ @Slf4j public class Test { public static void main(String[] args) throws Exception{ args=new String[]{"--application","flink/src/main/java/com/bigdata/flink/tableSqlLookableTableSource/application.properties"}; //1. Parsing command line parameters ParameterTool fromArgs = ParameterTool.fromArgs(args); ParameterTool parameterTool = ParameterTool.fromPropertiesFile(fromArgs.getRequired("application")); String kafkaBootstrapServers = parameterTool.getRequired("kafkaBootstrapServers"); String browseTopic = parameterTool.getRequired("browseTopic"); String browseTopicGroupID = parameterTool.getRequired("browseTopicGroupID"); String hbaseZookeeperQuorum = parameterTool.getRequired("hbaseZookeeperQuorum"); String hbaseZnode = parameterTool.getRequired("hbaseZnode"); String hbaseTable = parameterTool.getRequired("hbaseTable"); String mysqlDBUrl = parameterTool.getRequired("mysqlDBUrl"); String mysqlUser = parameterTool.getRequired("mysqlUser"); String mysqlPwd = parameterTool.getRequired("mysqlPwd"); String mysqlTable = parameterTool.getRequired("mysqlTable"); //2. Set up the running environment EnvironmentSettings settings = EnvironmentSettings.newInstance().inStreamingMode().useBlinkPlanner().build(); StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment(); StreamTableEnvironment tableEnv = StreamTableEnvironment.create(streamEnv, settings); streamEnv.setParallelism(1); //3. 
Register Kafka data source //According to the self-made test data, a user clicks on a commodity at a certain time, and the value of the commodity is as follows //{"userID": "user_1", "eventTime": "2016-01-01 10:02:00", "eventType": "browse", "productID": "product_1", "productPrice": 20} Properties browseProperties = new Properties(); browseProperties.put("bootstrap.servers",kafkaBootstrapServers); browseProperties.put("group.id",browseTopicGroupID); DataStream<UserBrowseLog> browseStream=streamEnv .addSource(new FlinkKafkaConsumer010<>(browseTopic, new SimpleStringSchema(), browseProperties)) .process(new BrowseKafkaProcessFunction()); tableEnv.registerDataStream("kafka",browseStream,"userID,eventTime,eventTimeTimestamp,eventType,productID,productPrice"); //tableEnv.toAppendStream(tableEnv.scan("kafka"),Row.class).print(); //4. Register HBase data source (Lookup Table Source) Configuration conf = new Configuration(); conf.set("hbase.zookeeper.quorum", hbaseZookeeperQuorum); conf.set("zookeeper.znode.parent",hbaseZnode); HBaseTableSource hBaseTableSource = new HBaseTableSource(conf, hbaseTable); hBaseTableSource.setRowKey("uid",String.class); hBaseTableSource.addColumn("f1","name",String.class); hBaseTableSource.addColumn("f1","age",Integer.class); tableEnv.registerTableSource("hbase",hBaseTableSource); //Register TableFunction tableEnv.registerFunction("hbaseLookup", hBaseTableSource.getLookupFunction(new String[]{"uid"})); //5. Register Mysql data source (Lookup Table Source) String[] mysqlFieldNames={"pid","productName","productCategory","updatedAt"}; DataType[] mysqlFieldTypes={DataTypes.STRING(),DataTypes.STRING(),DataTypes.STRING(),DataTypes.STRING()}; TableSchema mysqlTableSchema = TableSchema.builder().fields(mysqlFieldNames, mysqlFieldTypes).build(); JDBCOptions jdbcOptions = JDBCOptions.builder() .setDriverName("com.mysql.jdbc.Driver") .setDBUrl(mysqlDBUrl) .setUsername(mysqlUser) .setPassword(mysqlPwd) .setTableName(mysqlTable) .build(); JDBCLookupOptions jdbcLookupOptions = JDBCLookupOptions.builder() .setCacheExpireMs(10 * 1000) //Cache validity .setCacheMaxSize(10) //Maximum number of cached data .setMaxRetryTimes(3) //max retries .build(); JDBCTableSource jdbcTableSource = JDBCTableSource.builder() .setOptions(jdbcOptions) .setLookupOptions(jdbcLookupOptions) .setSchema(mysqlTableSchema) .build(); tableEnv.registerTableSource("mysql",jdbcTableSource); //Register TableFunction tableEnv.registerFunction("mysqlLookup",jdbcTableSource.getLookupFunction(new String[]{"pid"})); //6. Inquiry //Obtain user basic information from HBase table according to userID //Obtain the basic product information from the Mysql table according to the productID String sql = "" + "SELECT " + " userID, " + " eventTime, " + " eventType, " + " productID, " + " productPrice, " + " f1.name AS userName, " + " f1.age AS userAge, " + " productName, " + " productCategory " + "FROM " + " kafka, " + " LATERAL TABLE(hbaseLookup(userID)), " + " LATERAL TABLE (mysqlLookup(productID))"; tableEnv.toAppendStream(tableEnv.sqlQuery(sql),Row.class).print(); //7. 
Start execution tableEnv.execute(Test.class.getSimpleName()); } /** * Parsing Kafka data */ static class BrowseKafkaProcessFunction extends ProcessFunction<String, UserBrowseLog> { @Override public void processElement(String value, Context ctx, Collector<UserBrowseLog> out) throws Exception { try { UserBrowseLog log = JSON.parseObject(value, UserBrowseLog.class); // Add a long timestamp // Specify Beijing time in yyyy MM DD HH: mm: SS format for eventTime DateTimeFormatter format = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); OffsetDateTime eventTime = LocalDateTime.parse(log.getEventTime(), format).atOffset(ZoneOffset.of("+08:00")); // Convert to millisecond timestamp long eventTimeTimestamp = eventTime.toInstant().toEpochMilli(); log.setEventTimeTimestamp(eventTimeTimestamp); out.collect(log); }catch (Exception ex){ log.error("analysis Kafka Data exception...",ex); } } } }
Code example - Flink SQL
package com.bigdata.flink.dimJoin; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; import org.apache.flink.table.api.EnvironmentSettings; import org.apache.flink.table.api.Table; import org.apache.flink.table.api.java.StreamTableEnvironment; import org.apache.flink.types.Row; /** * Author: Wang Pei * Summary: * Kafka Join Mysql-Dim */ public class KafkaJoinMysqlDim { public static void main(String[] args) throws Exception { EnvironmentSettings settings = EnvironmentSettings.newInstance().inStreamingMode().useBlinkPlanner().build(); StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment(); StreamTableEnvironment tableEnv = StreamTableEnvironment.create(streamEnv, settings); // Source DDL // Kafka data: {"userID":"user_1","eventType":"click","eventTime":"2015-01-01 00:00:00"} String sourceDDL = "" + "create table source_kafka " + "( " + " userID STRING, " + " eventType STRING, " + " eventTime STRING, " + " proctime AS PROCTIME() " + ") with ( " + " 'connector.type' = 'kafka', " + " 'connector.version' = '0.10', " + " 'connector.properties.bootstrap.servers' = 'kafka01:9092', " + " 'connector.properties.zookeeper.connect' = 'kafka01:2181', " + " 'connector.topic' = 'test_1', " + " 'connector.properties.group.id' = 'c1_test_1', " + " 'connector.startup-mode' = 'latest-offset', " + " 'format.type' = 'json' " + ")"; tableEnv.sqlUpdate(sourceDDL); //tableEnv.toAppendStream(tableEnv.from("source_kafka"), Row.class).print(); // Dim DDL // Mysql dimension data // mysql> select * from t_user_info limit 1; // +--------+----------+---------+ // | userID | userName | userAge | // +--------+----------+---------+ // | user_1 | name1 | 10 | // +--------+----------+---------+ String dimDDL = "" + "CREATE TABLE dim_mysql ( " + " userID STRING, " + " userName STRING, " + " userAge INT " + ") WITH ( " + " 'connector.type' = 'jdbc', " + " 'connector.url' = 'jdbc:mysql://localhost:3306/bigdata', " + " 'connector.table' = 't_user_info', " + " 'connector.driver' = 'com.mysql.jdbc.Driver', " + " 'connector.username' = '****', " + " 'connector.password' = '******' " + ")"; tableEnv.sqlUpdate(dimDDL); // Query // Left Join String execSQL = "" + "SELECT " + " kafka.*,mysql.userName,mysql.userAge " + "FROM " + " source_kafka as kafka" + " LEFT JOIN dim_mysql FOR SYSTEM_TIME AS OF kafka.proctime AS mysql " + " ON kafka.userID = mysql.userID"; Table table = tableEnv.sqlQuery(execSQL); tableEnv.toAppendStream(table, Row.class).print(); tableEnv.execute(KafkaJoinMysqlDim.class.getSimpleName()); } }
Notes
- The data source must implement the LookupableTableSource interface.
- This approach is more general than a hand-written UDTF.
- It relies on external storage, so changes to the dimension data can be picked up in time.
- At the time of writing, only the Blink planner supports it.
Hello, I'm Wang Zhiwu, an original-content author in the big data field. I have worked on back-end architecture, data middleware, data platforms and architecture, and algorithm engineering.