
FusionInsight MRS: Reading and Writing Hudi with the Flink DataStream API

Abstract: At present Hudi only provides Flink SQL for reading and writing data, but in real-world projects some customers need to read and write Hudi with the Flink DataStream API.
This article is shared from the Huawei Cloud community article "FusionInsight MRS: Reading and Writing Hudi with the Flink DataStream API", by yangxiao_mrs.
This practice consists of three parts:
1) HoodiePipeline.java, which wraps the Hudi core read/write interfaces and exposes a Hudi DataStream API.
2) WriteIntoHudi.java, which uses the DataStream API to write data into Hudi.
3) ReadFromHudi.java, which uses the DataStream API to read Hudi data.
1. HoodiePipeline.java wraps the Hudi core read/write interfaces and exposes a Hudi DataStream API. Key implementation logic:
Step 1: Take the column names, primary key and partition key set on the builder for the Hudi streaming table and concatenate them into a CREATE TABLE statement with a StringBuilder (a sample of the generated DDL is shown right after this list).
Step 2: Register the Hudi streaming table in the catalog.
Step 3: Convert the dynamic table source/sink into a DataStreamScanProvider/DataStreamSinkProvider and use it to produce or consume the data stream.
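For example, with the table name and columns from the Javadoc sample in the class below (f0, f1, f2; primary key f0,f1; partition f2) and the demo's HDFS path as the only option, getCreateHoodieTableDDL builds and prints DDL of the following shape:

create table myTable(
f0 int,
f1 varchar(10),
f2 varchar(20),
PRIMARY KEY(f0,f1) NOT ENFORCED
)
PARTITIONED BY (`f2`)
with ('connector' = 'hudi',
'path' = 'hdfs://hacluster/tmp/flinkHudi/hudiTable'
)
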
import org.apache.flink.configuration.ConfigOption;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.configuration.ReadableConfig;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSink;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.internal.TableEnvironmentImpl;
import org.apache.flink.table.catalog.Catalog;
import org.apache.flink.table.catalog.CatalogTable;
import org.apache.flink.table.catalog.ObjectIdentifier;
import org.apache.flink.table.catalog.ObjectPath;
import org.apache.flink.table.catalog.exceptions.TableNotExistException;
import org.apache.flink.table.connector.sink.DataStreamSinkProvider;
import org.apache.flink.table.connector.source.DataStreamScanProvider;
import org.apache.flink.table.connector.source.ScanTableSource;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.factories.DynamicTableFactory;
import org.apache.flink.table.runtime.connector.sink.SinkRuntimeProviderContext;
import org.apache.flink.table.runtime.connector.source.ScanRuntimeProviderContext;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.table.HoodieTableFactory;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

/**
*A tool class to construct hoodie flink pipeline.
*
*<p>How to use ?</p>
*Method {@link #builder(String)} returns a pipeline builder. The builder
*can then define the hudi table columns, primary keys and partitions.
*
*<p>An example:</p>
*<pre>
*    HoodiePipeline.Builder builder = HoodiePipeline.builder("myTable");
*    DataStreamSink<?> sinkStream = builder
*      .column("f0 int")
*      .column("f1 varchar(10)")
*      .column("f2 varchar(20)")
*      .pk("f0,f1")
*      .partition("f2")
*      .sink(input, false);
*</pre>
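*
*<p>An example of reading as a streaming source:</p>
*<pre>
*    HoodiePipeline.Builder builder = HoodiePipeline.builder("myTable");
*    DataStream<RowData> rowDataStream = builder
*      .column("f0 int")
*      .column("f1 varchar(10)")
*      .column("f2 varchar(20)")
*      .pk("f0,f1")
*      .partition("f2")
*      .source(execEnv);
*</pre>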
*/
public class HoodiePipeline {

/**
   * Returns the builder for hoodie pipeline construction.
   */
public static Builder builder(String tableName) {
    return new Builder(tableName);
}

    /**
   * Builder for hudi source/sink pipeline construction.
   */
    public static class Builder {
      private final String tableName;
      private final List<String> columns;
      private final Map<String, String> options;

      private String pk;
      private List<String> partitions;

      private Builder(String tableName) {
      this.tableName = tableName;
      this.columns = new ArrayList<>();
      this.options = new HashMap<>();
      this.partitions = new ArrayList<>();
      }

      /**
       * Add a table column definition.
       *
       * @param column the column format should be in the form like 'f0 int'
       */
      public Builder column(String column) {
      this.columns.add(column);
      return this;
      }

      /**
       * Add primary keys.
       */
      public Builder pk(String... pks) {
      this.pk = String.join(",", pks);
      return this;
      }

      /**
       * Add partition fields.
       */
      public Builder partition(String... partitions) {
      this.partitions = new ArrayList<>(Arrays.asList(partitions));
      return this;
      }

      /**
       * Add a config option.
       */
      public Builder option(ConfigOption<?> option, Object val) {
      this.options.put(option.key(), val.toString());
      return this;
      }

      public Builder option(String key, Object val) {
      this.options.put(key, val.toString());
      return this;
      }

      public Builder options(Map<String, String> options) {
      this.options.putAll(options);
      return this;
      }

      public DataStreamSink<?> sink(DataStream<RowData> input, boolean bounded) {
      TableDescriptor tableDescriptor = getTableDescriptor();
      return HoodiePipeline.sink(input, tableDescriptor.getTableId(), tableDescriptor.getCatalogTable(), bounded);
      }

      public TableDescriptor getTableDescriptor() {
      EnvironmentSettings environmentSettings = EnvironmentSettings
            .newInstance()
            .build();
      TableEnvironmentImpl tableEnv = TableEnvironmentImpl.create(environmentSettings);
      String sql = getCreateHoodieTableDDL(this.tableName, this.columns, this.options, this.pk, this.partitions);
      tableEnv.executeSql(sql);
      String currentCatalog = tableEnv.getCurrentCatalog();
      CatalogTable catalogTable = null;
      String defaultDatabase = null;
      try {
            Catalog catalog = tableEnv.getCatalog(currentCatalog).get();
            defaultDatabase = catalog.getDefaultDatabase();
            catalogTable = (CatalogTable) catalog.getTable(new ObjectPath(defaultDatabase, this.tableName));
      } catch (TableNotExistException e) {
            throw new HoodieException("Create table " + this.tableName + " exception", e);
      }
      ObjectIdentifier tableId = ObjectIdentifier.of(currentCatalog, defaultDatabase, this.tableName);
      return new TableDescriptor(tableId, catalogTable);
      }

      public DataStream<RowData> source(StreamExecutionEnvironment execEnv) {
      TableDescriptor tableDescriptor = getTableDescriptor();
      return HoodiePipeline.source(execEnv, tableDescriptor.tableId, tableDescriptor.getCatalogTable());
      }
    }

    private static String getCreateHoodieTableDDL(
      String tableName,
      List<String> fields,
      Map<String, String> options,
      String pkField,
      List<String> partitionField) {
      StringBuilder builder = new StringBuilder();
      builder.append("create table ")
          .append(tableName)
          .append("(\n");
      for (String field : fields) {
      builder.append("")
            .append(field)
            .append(",\n");
      }
      builder.append("PRIMARY KEY(")
          .append(pkField)
          .append(") NOT ENFORCED\n")
          .append(")\n");
      if (!partitionField.isEmpty()) {
      String partitons = partitionField
            .stream()
            .map(partitionName -> "`" + partitionName + "`")
            .collect(Collectors.joining(","));
      builder.append("PARTITIONED BY (")
            .append(partitons)
            .append(")\n");
      }
      builder.append("with ('connector' = 'hudi'");
      options.forEach((k, v) -> builder
          .append(",\n")
          .append("'")
          .append(k)
          .append("' = '")
          .append(v)
          .append("'"));
      builder.append("\n)");

      System.out.println(builder.toString());
      return builder.toString();
    }

    /**
   * Returns the data stream sink with given catalog table.
   *
   * @param input      The input datastream
   * @param tablePath    The table path to the hoodie table in the catalog
   * @param catalogTable The hoodie catalog table
   * @param isBounded    A flag indicating whether the input data stream is bounded
   */
    private static DataStreamSink<?> sink(DataStream<RowData> input, ObjectIdentifier tablePath, CatalogTable catalogTable, boolean isBounded) {
      DefaultDynamicTableContext context = new DefaultDynamicTableContext(tablePath, catalogTable,
          Configuration.fromMap(catalogTable.getOptions()), Thread.currentThread().getContextClassLoader(), false);
      HoodieTableFactory hoodieTableFactory = new HoodieTableFactory();
      return ((DataStreamSinkProvider) hoodieTableFactory.createDynamicTableSink(context)
          .getSinkRuntimeProvider(new SinkRuntimeProviderContext(isBounded)))
          .consumeDataStream(input);
    }

    /**
   * Returns the data stream source with given catalog table.
   *
   * @param execEnv      The execution environment
   * @param tablePath    The table path to the hoodie table in the catalog
   * @param catalogTable The hoodie catalog table
   */
    private static DataStream<RowData> source(StreamExecutionEnvironment execEnv, ObjectIdentifier tablePath, CatalogTable catalogTable) {
      DefaultDynamicTableContext context = new DefaultDynamicTableContext(tablePath, catalogTable,
          Configuration.fromMap(catalogTable.getOptions()), Thread.currentThread().getContextClassLoader(), false);
      HoodieTableFactory hoodieTableFactory = new HoodieTableFactory();
      DataStreamScanProvider dataStreamScanProvider = (DataStreamScanProvider) ((ScanTableSource) hoodieTableFactory
          .createDynamicTableSource(context))
          .getScanRuntimeProvider(new ScanRuntimeProviderContext());
      return dataStreamScanProvider.produceDataStream(execEnv);
    }

    /**
   * A POJO that contains the table identifier and the catalog table.
   */
    public static class TableDescriptor {
      private ObjectIdentifier tableId;
      private CatalogTable catalogTable;

      public TableDescriptor(ObjectIdentifier tableId, CatalogTable catalogTable) {
          this.tableId = tableId;
          this.catalogTable = catalogTable;
      }

      public ObjectIdentifier getTableId() {
          return tableId;
      }

      public CatalogTable getCatalogTable() {
            return catalogTable;
      }
    }

    private static class DefaultDynamicTableContext implements DynamicTableFactory.Context {

      private final ObjectIdentifier objectIdentifier;
      private final CatalogTable catalogTable;
      private final ReadableConfig configuration;
      private final ClassLoader classLoader;
      private final boolean isTemporary;

      DefaultDynamicTableContext(
      ObjectIdentifier objectIdentifier,
      CatalogTable catalogTable,
      ReadableConfig configuration,
      ClassLoader classLoader,
      boolean isTemporary) {
      this.objectIdentifier = objectIdentifier;
      this.catalogTable = catalogTable;
      this.configuration = configuration;
      this.classLoader = classLoader;
      this.isTemporary = isTemporary;
      }

      @Override
      public ObjectIdentifier getObjectIdentifier() {
      return objectIdentifier;
      }

      @Override
      public CatalogTable getCatalogTable() {
      return catalogTable;
      }

      @Override
      public ReadableConfig getConfiguration() {
      return configuration;
      }

      @Override
      public ClassLoader getClassLoader() {
      return classLoader;
      }

      @Override
      public boolean isTemporary() {
            return isTemporary;
      }
    }
}

2. WriteIntoHudi.java uses the DataStream API to write data into Hudi. Key implementation logic:
Step 1: The demo's data source is a Table backed by the datagen connector.
Step 2: Convert the Table into a DataStream with toAppendStream.
Step 3: Build the Hudi sink stream and write the data into Hudi.
In real projects you can also feed any DataStream<RowData> source straight into the Hudi sink without starting from a Table; a minimal sketch of that direct route follows, and the full demo after it uses the datagen Table instead.
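As a minimal sketch of that direct route (the class name WriteDataStreamIntoHudi, the trivial SourceFunction and the generated rows are made up for illustration; the table layout and options mirror the demo below):

import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.table.data.GenericRowData;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.data.StringData;
import org.apache.flink.table.data.TimestampData;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.configuration.FlinkOptions;

import java.util.HashMap;
import java.util.Map;

public class WriteDataStreamIntoHudi {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Hudi commits on checkpoint completion, so checkpointing must be enabled.
        env.getCheckpointConfig().setCheckpointInterval(10000);

        // Any DataStream<RowData> can be used as input; here a trivial SourceFunction
        // emits one illustrative GenericRowData record per second.
        DataStream<RowData> input = env.addSource(new SourceFunction<RowData>() {
            private volatile boolean running = true;

            @Override
            public void run(SourceContext<RowData> ctx) throws Exception {
                long id = 0;
                while (running) {
                    GenericRowData row = new GenericRowData(5);
                    row.setField(0, StringData.fromString("id" + id));
                    row.setField(1, StringData.fromString("name" + id));
                    row.setField(2, (int) (id % 100));
                    row.setField(3, TimestampData.fromEpochMillis(System.currentTimeMillis()));
                    row.setField(4, StringData.fromString("par1"));
                    ctx.collect(row);
                    id++;
                    Thread.sleep(1000L);
                }
            }

            @Override
            public void cancel() {
                running = false;
            }
        }, TypeInformation.of(RowData.class));

        Map<String, String> options = new HashMap<>();
        options.put(FlinkOptions.PATH.key(), "hdfs://hacluster/tmp/flinkHudi/hudiTable");
        options.put(FlinkOptions.TABLE_TYPE.key(), HoodieTableType.MERGE_ON_READ.name());
        options.put(FlinkOptions.PRECOMBINE_FIELD.key(), "ts");

        HoodiePipeline.Builder builder = HoodiePipeline.builder("hudiSinkTable")
                .column("uuid VARCHAR(20)")
                .column("name VARCHAR(10)")
                .column("age INT")
                .column("ts TIMESTAMP(3)")
                .column("p VARCHAR(20)")
                .pk("uuid")
                .partition("p")
                .options(options);

        builder.sink(input, false); // false: the input stream is unbounded
        env.execute("DataStream_Api_Sink");
    }
}

The full demo:
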
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.data.RowData;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.configuration.FlinkOptions;

import java.util.HashMap;
import java.util.Map;

public class WriteIntoHudi {
    public static void main(String[] args) throws Exception {

      StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
      StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
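      // Hudi commits data when a Flink checkpoint completes, so checkpointing must stay enabled for the sink to make writes visible.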
      env.getCheckpointConfig().setCheckpointInterval(10000);

      tableEnv.executeSql("CREATE TABLE datagen (\n"
            + "uuid varchar(20),\n"
            + "name varchar(10),\n"
            + "age int,\n"
            + "ts timestamp(3),\n"
            + "p varchar(20)\n"
            + ") WITH (\n"
            + "'connector' = 'datagen',\n"
            + "'rows-per-second' = '5'\n"
            + ")");

      Table table = tableEnv.sqlQuery("SELECT * FROM datagen");

      DataStream<RowData> dataStream = tableEnv.toAppendStream(table, RowData.class);
      String targetTable = "hudiSinkTable";

      String basePath = "hdfs://hacluster/tmp/flinkHudi/hudiTable";

      Map<String, String> options = new HashMap<>();
      options.put(FlinkOptions.PATH.key(), basePath);
      options.put(FlinkOptions.TABLE_TYPE.key(), HoodieTableType.MERGE_ON_READ.name());
      options.put(FlinkOptions.PRECOMBINE_FIELD.key(), "ts");
      options.put(FlinkOptions.INDEX_BOOTSTRAP_ENABLED.key(), "true");

      HoodiePipeline.Builder builder = HoodiePipeline.builder(targetTable)
            .column("uuid VARCHAR(20)")
            .column("name VARCHAR(10)")
            .column("age INT")
            .column("ts TIMESTAMP(3)")
            .column("p VARCHAR(20)")
            .pk("uuid")
            .partition("p")
            .options(options);

      builder.sink(dataStream, false); // The second parameter indicates whether the input data stream is bounded
      env.execute("Api_Sink");
    }
}

3. ReadFromHudi.java uses the DataStream API to read Hudi data. Key implementation logic:
Step 1: Build the Hudi source stream and read the Hudi data.
Step 2: Convert the stream into a Table with fromDataStream.
Step 3: Print the Hudi table data with the print connector.
In real projects you can also consume the Hudi source DataStream directly and attach a sink to it; a minimal sketch of that direct route follows, and the full demo after it routes the data through a Table and the print connector instead.
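As a minimal sketch of that direct route (the class name ReadHudiIntoDataStream is made up for illustration, and print() stands in for any real downstream sink; the table layout and path mirror the demo below):

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.configuration.FlinkOptions;

import java.util.HashMap;
import java.util.Map;

public class ReadHudiIntoDataStream {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        Map<String, String> options = new HashMap<>();
        options.put(FlinkOptions.PATH.key(), "hdfs://hacluster/tmp/flinkHudi/hudiTable");
        options.put(FlinkOptions.TABLE_TYPE.key(), HoodieTableType.MERGE_ON_READ.name());
        options.put(FlinkOptions.READ_AS_STREAMING.key(), "true"); // continuous streaming read

        HoodiePipeline.Builder builder = HoodiePipeline.builder("hudiSourceTable")
                .column("uuid VARCHAR(20)")
                .column("name VARCHAR(10)")
                .column("age INT")
                .column("ts TIMESTAMP(3)")
                .column("p VARCHAR(20)")
                .pk("uuid")
                .partition("p")
                .options(options);

        // Consume the DataStream<RowData> directly: print it, or replace print()
        // with any custom sink, without converting back to a Table.
        builder.source(env).print();
        env.execute("DataStream_Api_Source");
    }
}

The full demo:
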
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.data.RowData;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.configuration.FlinkOptions;

import java.util.HashMap;
import java.util.Map;

public class ReadFromHudi {
    public static void main(String[] args) throws Exception {
      StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
      String targetTable = "hudiSourceTable";
      String basePath = "hdfs://hacluster/tmp/flinkHudi/hudiTable";

      Map<String, String> options = new HashMap<>();
      options.put(FlinkOptions.PATH.key(), basePath);
      options.put(FlinkOptions.TABLE_TYPE.key(), HoodieTableType.MERGE_ON_READ.name());
      options.put(FlinkOptions.READ_AS_STREAMING.key(), "true"); // this option enables the streaming read
      options.put("read.streaming.start-commit", "20210316134557"); // specifies the start commit instant time

      HoodiePipeline.Builder builder = HoodiePipeline.builder(targetTable)
            .column("uuid VARCHAR(20)")
            .column("name VARCHAR(10)")
            .column("age INT")
            .column("ts TIMESTAMP(3)")
            .column("p VARCHAR(20)")
            .pk("uuid")
            .partition("p")
            .options(options);

      DataStream<RowData> rowDataDataStream = builder.source(env);

      StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
      Table table = tableEnv.fromDataStream(rowDataDataStream, "uuid, name, age, ts, p");

      tableEnv.registerTable("hudiSourceTable", table);

      tableEnv.executeSql("CREATE TABLE print("
            + "   uuid varchar(20),\n"
            + "   name varchar(10),\n"
            + "   age int,\n"
            + "   ts timestamp(3),\n"
            + "   p varchar(20)\n"
            + ") WITH (\n"
            + " 'connector' = 'print'\n"
            + ")");

      tableEnv.executeSql("insert into print select * from hudiSourceTable");
      env.execute("Api_Source");
    }
}

4. If a real project needs to parse complex JSON from Kafka, there are two options:
1) Use Flink SQL: https://bbs.huaweicloud.com/forum/thread-153494-1-1.html
2) Implement the parsing with a Flink DataStream MapFunction, as sketched below.
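As a minimal sketch of option 2), assuming the Kafka value is a JSON string with a nested user object; the field layout, the class name JsonToRowDataMapFunction and the Jackson dependency are assumptions made for the illustration only:

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.table.data.GenericRowData;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.data.StringData;
import org.apache.flink.table.data.TimestampData;

/**
 * Parses a Kafka JSON value such as
 * {"uuid":"id1","user":{"name":"Alice","age":23},"ts":1668417201000,"p":"par1"}
 * and flattens it into a RowData record matching the hudiSinkTable schema above.
 */
public class JsonToRowDataMapFunction implements MapFunction<String, RowData> {

    // Created lazily on the task side and reused for every record.
    private transient ObjectMapper mapper;

    @Override
    public RowData map(String json) throws Exception {
        if (mapper == null) {
            mapper = new ObjectMapper();
        }
        JsonNode root = mapper.readTree(json);
        GenericRowData row = new GenericRowData(5);
        row.setField(0, StringData.fromString(root.get("uuid").asText()));
        row.setField(1, StringData.fromString(root.get("user").get("name").asText())); // nested field
        row.setField(2, root.get("user").get("age").asInt());                          // nested field
        row.setField(3, TimestampData.fromEpochMillis(root.get("ts").asLong()));
        row.setField(4, StringData.fromString(root.get("p").asText()));
        return row;
    }
}

A DataStream<String> obtained from a Kafka source can then be converted with kafkaStream.map(new JsonToRowDataMapFunction()) and written to Hudi through HoodiePipeline.Builder#sink as in section 2.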