Yang, Tong 3f5759eed2 MRS comp-lts 2.0.38.SP20 version
Reviewed-by: Hasko, Vladimir <vladimir.hasko@t-systems.com>
Co-authored-by: Yang, Tong <yangtong2@huawei.com>
Co-committed-by: Yang, Tong <yangtong2@huawei.com>
2023-01-19 17:08:45 +00:00

72 lines
4.2 KiB
HTML

<a name="mrs_01_24069"></a><a name="mrs_01_24069"></a>
<h1 class="topictitle1">Bootstrapping</h1>
<div id="body32001227"><p id="mrs_01_24069__en-us_topic_0000001219029529_p14274145191513">The bootstrapping function provided by Hudi converts historical tables into Hudi tables without any change by generating Hoodie management files based on historical Parquet tables.</p>
<p id="mrs_01_24069__en-us_topic_0000001219029529_p67991961239">The following shows an example of converting a Hive table in the <strong id="mrs_01_24069__en-us_topic_0000001219029529_b1962514531183">hdfs://hacluster/user/hive/warehouse/pq1</strong> directory on HDFS to a Hudi table and saving it in the <strong id="mrs_01_24069__en-us_topic_0000001219029529_b1663135311811">hdfs://hacluster/tmp/hudi_bootstrap_test</strong> directory.</p>
<div class="note" id="mrs_01_24069__en-us_topic_0000001219029529_note280310501785"><img src="public_sys-resources/note_3.0-en-us.png"><span class="notetitle"> </span><div class="notebody"><ul id="mrs_01_24069__en-us_topic_0000001219029529_ul13771956084"><li id="mrs_01_24069__en-us_topic_0000001219029529_li2771115619816">Bootstrapping and write operations cannot be executed concurrently. Bootstrapping is used only to create new Hudi tables, not to operate on existing Hudi tables.</li><li id="mrs_01_24069__en-us_topic_0000001219029529_li57718566817">Bootstrapping supports only the write operations on COW tables.</li></ul>
</div></div>
<pre class="screen" id="mrs_01_24069__en-us_topic_0000001219029529_screen1761116194310">spark-shell
import collection.JavaConverters._
import org.apache.hadoop.fs.FileSystem
import org.apache.hudi.bootstrap.SparkParquetBootstrapDataProvider
import org.apache.hudi.client.bootstrap.selector.FullRecordBootstrapModeSelector
import org.apache.hudi.{DataSourceReadOptions, DataSourceWriteOptions, HoodieDataSourceHelpers}
import org.apache.hudi.common.fs.FSUtils
import org.apache.hudi.common.table.timeline.HoodieTimeline
import org.apache.hudi.config.{HoodieBootstrapConfig, HoodieCompactionConfig, HoodieWriteConfig}
import org.apache.hudi.keygen.SimpleKeyGenerator
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql.functions.{col, lit}
import org.apache.spark.sql.{SaveMode, SparkSession}
import org.apache.hudi.QuickstartUtils._
import scala.collection.JavaConversions._
import org.apache.spark.sql.SaveMode._
import org.apache.hudi.DataSourceReadOptions._
import org.apache.hudi.DataSourceWriteOptions._
import org.apache.hudi.config.HoodieWriteConfig._
import java.time._
import java.util.Collections
// Example script: bootstrap the table stored at srcPath into a new Hudi table at basePath,
// then read the result back and display it.
// NOTE(review): timestamp, jsc and numRecords are defined here but not referenced again in this snippet.
val timestamp = Instant.now.toEpochMilli
val jsc = JavaSparkContext.fromSparkContext(spark.sparkContext)
val numRecords: Int = 100
// Source location: the existing Hive table directory to be converted.
val srcPath = "hdfs://hacluster/user/hive/warehouse/pq1"
// Target location: the directory where the resulting Hudi table is saved.
val basePath = "hdfs://hacluster/tmp/hudi_bootstrap_test"
// Hudi configuration information shared by the write below.
val commonOpts: Map[String, String] = Map(
// Parallelism settings for the individual operations, all set to 4 in this example.
HoodieWriteConfig.INSERT_PARALLELISM -&gt; "4",
HoodieWriteConfig.UPSERT_PARALLELISM -&gt; "4",
HoodieWriteConfig.DELETE_PARALLELISM -&gt; "4",
HoodieWriteConfig.BULKINSERT_PARALLELISM -&gt; "4",
HoodieWriteConfig.FINALIZE_WRITE_PARALLELISM -&gt; "4",
HoodieBootstrapConfig.BOOTSTRAP_PARALLELISM -&gt; "4",
// Record key, partition path and precombine fields.
// NOTE(review): presumably "col1", "partition" and "timestamp" are columns of the source table - confirm against its schema.
DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY -&gt; "col1",
DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY -&gt; "partition",
DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY -&gt; "timestamp",
// Name given to the Hudi table.
HoodieWriteConfig.TABLE_NAME -&gt; "hoodie_test"
)
// Bootstrapping: the DataFrame being written is empty; the source location is supplied
// through the BOOTSTRAP_BASE_PATH_PROP option instead of through DataFrame rows.
val bootstrapDF = spark.emptyDataFrame
bootstrapDF.write.
format("hudi").
options(commonOpts).
// Select the bootstrap operation and the COW table type.
option(DataSourceWriteOptions.OPERATION_OPT_KEY, DataSourceWriteOptions.BOOTSTRAP_OPERATION_OPT_VAL).
option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY, DataSourceWriteOptions.COW_TABLE_TYPE_OPT_VAL).
// Point the bootstrap at the source table and name the key generator class to use.
option(HoodieBootstrapConfig.BOOTSTRAP_BASE_PATH_PROP, srcPath).
option(HoodieBootstrapConfig.BOOTSTRAP_KEYGEN_CLASS, classOf[SimpleKeyGenerator].getName).
// Save with Overwrite mode to the target location.
mode(SaveMode.Overwrite).
save(basePath)
// Query data after bootstrapping: load from basePath with a "/*" glob suffix and display the rows.
// NOTE(review): declared as var although it is never reassigned in this snippet.
var hoodieROViewDF1 = spark.read.format("hudi").load(basePath + "/*")
hoodieROViewDF1.show</pre>
</div>
<div>
<div class="familylinks">
<div class="parentlink"><strong>Parent topic:</strong> <a href="mrs_01_24034.html">Write</a></div>
</div>
</div>