Starrocks扩展FileSystem代码分析 - ToB企服应用市场:ToB评测及商务社交产业平台

/**
* Processes one MySQL connection: receives one packet, processes it, and sends one packet back.
*/
public class ConnectProcessor {
...
// Handles a COM_QUERY statement: decodes the SQL text, parses it into statements,
// and executes each statement in order.
protected void handleQuery() {
...
// Decode `ending` bytes of the packet, starting at offset 1, as UTF-8 SQL text.
originStmt = new String(bytes, 1, ending, StandardCharsets.UTF_8);
...
try {
...
try {
// Parse the SQL text into a list of statements using the session variables.
stmts = com.starrocks.sql.parser.SqlParser.parse(originStmt, ctx.getSessionVariable());
} catch (ParsingException parsingException) {
// Re-throw parser failures as AnalysisException, carrying over only the message.
throw new AnalysisException(parsingException.getMessage());
}
// Each parsed statement gets its own StmtExecutor.
for (int i = 0; i < stmts.size(); ++i) {
...
parsedStmt = stmts.get(i);
...
executor = new StmtExecutor(ctx, parsedStmt);
...
executor.execute();
...
}
}
...
}
...
}

复制代码

// Executes a single parsed statement (see execute()).
public class StmtExecutor {
...
// Plans the statement and dispatches it by statement type. Large parts of the
// method are elided in this excerpt.
public void execute() throws Exception {
...
try {
...
// Everything inside this scope is timed under the "Total" tracer.
try (Timer ignored = Tracers.watchScope("Total")) {
...
// Planning below only happens when the statement is not forwarded to the leader.
if (!isForwardToLeader()) {
...
if {
...
} else {
// Build the execution plan for the parsed statement.
execPlan = StatementPlanner.plan(parsedStmt, context);
// For query statements, store the COSTS-level explain string in the dump info
// when the context asks for a query dump.
if (parsedStmt instanceof QueryStatement && context.shouldDumpQuery()) {
context.getDumpInfo().setExplainInfo(execPlan.getExplainString(TExplainLevel.COSTS));
}
}
...
}
}
...
}
...
}
...
if {
...
} else if (parsedStmt instanceof DmlStmt) {
// DML statements are handed to handleDMLStmtWithProfile together with the plan.
handleDMLStmtWithProfile(execPlan, (DmlStmt) parsedStmt);
} ...
}

复制代码

// Turns an analyzed statement into an ExecPlan.
public class StatementPlanner {
// Convenience overload: picks the result sink type from the kind of connection
// (HTTP_PROTOCAL for an HttpConnectContext, MYSQL_PROTOCAL otherwise) and delegates.
public static ExecPlan plan(StatementBase stmt, ConnectContext session) {
if (session instanceof HttpConnectContext) {
return plan(stmt, session, TResultSinkType.HTTP_PROTOCAL);
}
return plan(stmt, session, TResultSinkType.MYSQL_PROTOCAL);
}
// Analyzes the statement and then plans it with the planner matching its type.
// Handling of statement types other than the four shown is elided in this excerpt.
public static ExecPlan plan(StatementBase stmt, ConnectContext session,
TResultSinkType resultSinkType) {
...
try {
...
// Semantic analysis, timed under the "Analyzer" tracer.
try (Timer ignored = Tracers.watchScope("Analyzer")) {
Analyzer.analyze(stmt, session);
}
...
// Dispatch to the planner for the concrete statement type.
if (stmt instanceof QueryStatement) {
return planQuery(stmt, resultSinkType, session, false);
} else if (stmt instanceof InsertStmt) {
return new InsertPlanner().plan((InsertStmt) stmt, session);
} else if (stmt instanceof UpdateStmt) {
return new UpdatePlanner().plan((UpdateStmt) stmt, session);
} else if (stmt instanceof DeleteStmt) {
return new DeletePlanner().plan((DeleteStmt) stmt, session);
}
}
...
}
}

复制代码

/**
 * Entry point of statement analysis. A single shared instance owns one
 * AnalyzerVisitor, and every analyze() call is delegated to that visitor.
 */
public class Analyzer {
    // Visitor that performs the actual analysis.
    private final AnalyzerVisitor analyzerVisitor;

    // The only instance of this class; the constructor is private.
    private static final Analyzer INSTANCE = new Analyzer(new AnalyzerVisitor());

    private Analyzer(AnalyzerVisitor visitor) {
        this.analyzerVisitor = visitor;
    }

    /** Returns the shared Analyzer instance. */
    public static Analyzer getInstance() {
        return INSTANCE;
    }

    /** Analyzes the statement in the given connection context using the shared visitor. */
    public static void analyze(StatementBase statement, ConnectContext context) {
        final AnalyzerVisitor visitor = INSTANCE.analyzerVisitor;
        visitor.analyze(statement, context);
    }
}

复制代码

// Builds the execution plan for an INSERT statement.
public class InsertPlanner {
...
// Plans the insert: transform -> optimize -> build fragments -> attach the data sink.
public ExecPlan plan(InsertStmt insertStmt, ConnectContext session) {
...
// Transform the syntax tree into a logical plan.
try (Timer ignore = Tracers.watchScope("Transform")) {
logicalPlan = new RelationTransformer(columnRefFactory, session).transform(queryRelation);
}
...
try (Timer ignore = Tracers.watchScope("InsertPlanner")) {
...
// Run the optimizer to turn the logical plan into an optimized physical plan.
OptExpression optimizedPlan;
try (Timer ignore2 = Tracers.watchScope("Optimizer")) {
optimizedPlan = optimizer.optimize(
session,
logicalPlan.getRoot(),
requiredPropertySet,
new ColumnRefSet(logicalPlan.getOutputColumn()),
columnRefFactory);
}
...
// Split the physical plan into fragments and build the execution plan.
// Note that the sink type passed here is always MYSQL_PROTOCAL.
ExecPlan execPlan;
try (Timer ignore3 = Tracers.watchScope("PlanBuilder")) {
execPlan = PlanFragmentBuilder.createPhysicalPlan(
optimizedPlan, session, logicalPlan.getOutputColumn(), columnRefFactory,
queryRelation.getColumnOutputNames(), TResultSinkType.MYSQL_PROTOCAL, hasOutputFragment);
}
...
// If targetTable is a TableFunctionTable, use a TableFunctionTableSink as the plan's sink.
DataSink dataSink;
if (targetTable instanceof ...) {
} else if (targetTable instanceof TableFunctionTable) {
dataSink = new TableFunctionTableSink((TableFunctionTable) targetTable);
}
...
// The sink is attached to the first fragment of the execution plan.
PlanFragment sinkFragment = execPlan.getFragments().get(0);
...
sinkFragment.setSink(dataSink);
}
...
}
...
}

复制代码

// INSERT statement. The write target is either a named table (qualifiedName) or the
// FILES table function with its property list (FILES propertyList). The rows come
// from a query statement or from one or more VALUES tuples.
insertStatement
: explainDesc? INSERT setVarHint* (INTO | OVERWRITE) (qualifiedName | (FILES propertyList)) partitionNames?
(WITH LABEL label=identifier)? columnAliases?
(queryStatement | (VALUES expressionsWithDefault (',' expressionsWithDefault)*))
;

复制代码

// Builds the InsertStmt AST node from an insertStatement parse context.
// Only the INSERT INTO FILES(...) path is shown in this excerpt.
@Override
public ParseNode visitInsertStatement(StarRocksParser.InsertStatementContext context) {
...
// INSERT INTO FILES(...)
// The FILES property list becomes the table function properties of the statement.
Map<String, String> tableFunctionProperties = getPropertyList(context.propertyList());
InsertStmt res = new InsertStmt(tableFunctionProperties, queryStatement, createPos(context));
// Carry over the hints parsed from the setVarHint clauses.
res.setOptHints(visitVarHints(context.setVarHint()));
return res;
}

复制代码

// AST node of an INSERT statement.
public class InsertStmt extends DmlStmt {
...
// Ctor for INSERT INTO FILES(...)
// There is no real target table in this form, so a fixed placeholder table name is
// used, no target columns or partitions are set, and the statement is flagged as
// writing to a table function.
public InsertStmt(Map<String, String> tableFunctionProperties, QueryStatement queryStatement, NodePosition pos) {
super(pos);
this.tblName = new TableName("table_function_catalog", "table_function_db", "table_function_table");
this.targetColumnNames = null;
this.targetPartitionNames = null;
this.queryStatement = queryStatement;
this.tableFunctionAsTargetTable = true;
this.tableFunctionProperties = tableFunctionProperties;
}
...
}

复制代码

// AST visitor that routes each statement type to its dedicated analyzer.
public class AnalyzerVisitor extends AstVisitor<Void, ConnectContext> {
// Entry point: starts the visit of the statement with the session as visitor context.
public void analyze(StatementBase statement, ConnectContext session) {
visit(statement, session);
}
...
// INSERT statements are analyzed by InsertAnalyzer.
@Override
public Void visitInsertStatement(InsertStmt statement, ConnectContext session) {
InsertAnalyzer.analyze(statement, session);
return null;
}
....
}

复制代码

// Semantic analysis of INSERT statements.
public class InsertAnalyzer {
// Analyzes the source query, resolves the target table, and records the resolved
// target table and columns on the statement.
public static void analyze(InsertStmt insertStmt, ConnectContext session) {
QueryRelation query = insertStmt.getQueryStatement().getQueryRelation();
// Analyze the query part of the statement first.
new QueryAnalyzer(session).analyze(insertStmt.getQueryStatement());
// Collect the external tables of the query that satisfy Table::isHiveTable and
// turn the metadata cache off on each of them.
List<Table> tables = new ArrayList<>();
AnalyzerUtils.collectSpecifyExternalTables(insertStmt.getQueryStatement(), tables, Table::isHiveTable);
tables.stream().map(table -> (HiveTable) table)
.forEach(table -> table.useMetadataCache(false));
/*
* Target table
*/
Table table = getTargetTable(insertStmt, session);
...
insertStmt.setTargetTable(table);
insertStmt.setTargetColumns(targetColumns);
// Register the target table in the dump info when one is attached to the session.
if (session.getDumpInfo() != null) {
session.getDumpInfo().addTable(insertStmt.getTableName().getDb(), table);
}
}
...
// Resolves the table the statement writes to. For INSERT INTO FILES(...) the table
// is created from the table function properties; other cases are elided here.
private static Table getTargetTable(InsertStmt insertStmt, ConnectContext session) {
if (insertStmt.useTableFunctionAsTargetTable()) {
return insertStmt.makeTableFunctionTable();
}
...
}
...
}

复制代码

public class InsertStmt extends DmlStmt {
...
    /**
     * Builds the TableFunctionTable that INSERT INTO FILES(...) writes to, based on
     * the table function properties of this statement.
     *
     * Properties read here: "single" (must be "true" or "false", case-insensitive,
     * defaults to "false"), "path", "format", "partition_by" and "compression".
     *
     * @return the target table describing where and how the files are written
     * @throws SemanticException if "single" is neither "true" nor "false"
     */
    public Table makeTableFunctionTable() {
...
        // parse table function properties
        Map<String, String> props = getTableFunctionProperties();
        String single = props.getOrDefault("single", "false");
        if (!single.equalsIgnoreCase("true") && !single.equalsIgnoreCase("false")) {
            // The inner double quotes must be escaped; unescaped they end the string literal early.
            throw new SemanticException("got invalid parameter \"single\" = \"%s\", expect a boolean value (true or false).",
                    single);
        }

        boolean writeSingleFile = single.equalsIgnoreCase("true");
        String path = props.get("path");
        String format = props.get("format");
        String partitionBy = props.get("partition_by");
        String compressionType = props.get("compression");
...
        // Single-file mode: no partition columns, write-single-file flag set.
        if (writeSingleFile) {
            return new TableFunctionTable(path, format, compressionType, columns, null, true, props);
        }

        // No partitioning requested.
        if (partitionBy == null) {
            // prepend `data_` if path ends with forward slash
            if (path.endsWith("/")) {
                path += "data_";
            }
            return new TableFunctionTable(path, format, compressionType, columns, null, false, props);
        }
...
        // Partitioned write: pass the resolved partition column ids.
        return new TableFunctionTable(path, format, compressionType, columns, partitionColumnIDs, false, props);
    }
}

复制代码

// One fragment of an execution plan.
public class PlanFragment extends TreeNode<PlanFragment> {
...
// Serializes this fragment into its thrift representation.
public TPlanFragment toThrift() {
TPlanFragment result = new TPlanFragment();
...
// The output sink is only set on the thrift object when this fragment has a sink.
if (sink != null) {
result.setOutput_sink(sink.toThrift());
}
...
return result;
}
...
}

复制代码

// Data sink used when the insert target is a TableFunctionTable.
public class TableFunctionTableSink extends DataSink {
...
// Serializes the sink into a TDataSink that carries the target table description
// and the cloud configuration.
@Override
protected TDataSink toThrift() {
TTableFunctionTableSink tTableFunctionTableSink = new TTableFunctionTableSink();
tTableFunctionTableSink.setTarget_table(table.toTTableFunctionTable());
// Fill a fresh thrift cloud configuration from this sink's cloud configuration.
TCloudConfiguration tCloudConfiguration = new TCloudConfiguration();
cloudConfiguration.toThrift(tCloudConfiguration);
tTableFunctionTableSink.setCloud_configuration(tCloudConfiguration);
// Set the sink type to TABLE_FUNCTION_TABLE_SINK.
TDataSink tDataSink = new TDataSink(TDataSinkType.TABLE_FUNCTION_TABLE_SINK);
tDataSink.setTable_function_table_sink(tTableFunctionTableSink);
return tDataSink;
}
...
}

复制代码

// Client-side RPC interface of the "PBackendService" service.
public interface PBackendService {
// Asynchronous call of the remote method "exec_plan_fragment"; the result is
// delivered through the returned Future.
// NOTE(review): onceTalkTimeout = 60000 is presumably in milliseconds — confirm against the RPC library.
@ProtobufRPC(serviceName = "PBackendService", methodName = "exec_plan_fragment",
attachmentHandler = ThriftClientAttachmentHandler.class, onceTalkTimeout = 60000)
Future<PExecPlanFragmentResult> execPlanFragmentAsync(PExecPlanFragmentRequest request);
...
}

复制代码

// Service implementation layered on top of the service base class T.
template <typename T>
class PInternalServiceImplBase : public T {
public:
...
    // Handler of the exec_plan_fragment RPC; overrides the declaration inherited from T.
    // `done` is the protobuf completion closure of the call.
    void exec_plan_fragment(google::protobuf::RpcController* controller, const PExecPlanFragmentRequest* request,
                            PExecPlanFragmentResult* result, google::protobuf::Closure* done) override;
...
};

复制代码

// Prepares a pipeline FragmentExecutor for the request and, when preparation
// succeeds, executes it. A preparation failure that is flagged as a duplicate RPC
// invocation is reported as OK; any other preparation failure is returned as is.
template <typename T>
Status PInternalServiceImplBase<T>::_exec_plan_fragment_by_pipeline(const TExecPlanFragmentParams& t_common_param,
                                                                    const TExecPlanFragmentParams& t_unique_request) {
    pipeline::FragmentExecutor fragment_executor;
    auto prepare_status = fragment_executor.prepare(_exec_env, t_common_param, t_unique_request);
    if (!prepare_status.ok()) {
        if (prepare_status.is_duplicate_rpc_invocation()) {
            return Status::OK();
        }
        return prepare_status;
    }
    return fragment_executor.execute(_exec_env);
}

复制代码

// Prepares a fragment for execution in two timed phases: runtime state / plan
// preparation, then pipeline driver preparation. Each step aborts the whole
// preparation with its error status on failure. The construction of `request`
// from the two parameter sets is elided in this excerpt.
Status FragmentExecutor::prepare(ExecEnv* exec_env, const TExecPlanFragmentParams& common_request,
const TExecPlanFragmentParams& unique_request) {
...
{
// Phase 1, accounted to profiler.prepare_runtime_state_time.
SCOPED_RAW_TIMER(&profiler.prepare_runtime_state_time);
RETURN_IF_ERROR(_prepare_workgroup(request));
RETURN_IF_ERROR(_prepare_runtime_state(exec_env, request));
// Convert the thrift objects into the BE execution plan tree.
RETURN_IF_ERROR(_prepare_exec_plan(exec_env, request));
RETURN_IF_ERROR(_prepare_global_dict(request));
}
{
// Phase 2, accounted to profiler.prepare_pipeline_driver_time.
SCOPED_RAW_TIMER(&profiler.prepare_pipeline_driver_time);
// Prepare the pipeline drivers and parse the sink node.
RETURN_IF_ERROR(_prepare_pipeline_driver(exec_env, request));
RETURN_IF_ERROR(_prepare_stream_load_pipe(exec_env, request));
}
...
}
// Builds the pipelines of the fragment: creates the data sink when the request
// carries one, decomposes it into pipeline operators, and instantiates the drivers.
Status FragmentExecutor::_prepare_pipeline_driver(ExecEnv* exec_env, const UnifiedExecPlanFragmentParams& request) {
...
std::unique_ptr<DataSink> datasink;
// A sink is only created when the request has an output sink set.
if (request.isset_output_sink()) {
const auto& tsink = request.output_sink();
...
RETURN_IF_ERROR(DataSink::create_data_sink(runtime_state, tsink, fragment.output_exprs, params,
request.sender_id(), plan->row_desc(), &datasink));
// Convert the FE sink node into the BE TableFunctionTableSinkOperatorFactory.
RETURN_IF_ERROR(_decompose_data_sink_to_operator(runtime_state, &context, request, datasink, tsink,
fragment.output_exprs));
}
...
// Invoke every factory in the pipelines to create the actual BE operators; this is
// where the TableFunctionTableSinkOperator gets created.
if (!unready_pipeline_groups.empty()) {
RETURN_IF_ERROR(create_lazy_instantiate_drivers_pipeline(
runtime_state, &context, _query_ctx, _fragment_ctx.get(), std::move(unready_pipeline_groups), drivers));
}
...
}
// Translates a DataSink into pipeline operator factories, selecting the translation
// by the dynamic type of the sink. Only the TableFunctionTableSink branch is shown.
Status FragmentExecutor::_decompose_data_sink_to_operator(RuntimeState* runtime_state, PipelineBuilderContext* context,
const UnifiedExecPlanFragmentParams& request,
std::unique_ptr<starrocks::DataSink>& datasink,
const TDataSink& thrift_sink,
const std::vector<TExpr>& output_exprs) {
...
if (typeid(*datasink) == ...) {
...
} else if (typeid(*datasink) == typeid(starrocks::TableFunctionTableSink)) {
...
// The factory receives the target table's path, file format, compression type and
// write_single_file flag, the output/partition expressions and column names, and
// the cloud configuration taken from the thrift sink.
auto op = std::make_shared<TableFunctionTableSinkOperatorFactory>(
context->next_operator_id(), target_table.path, target_table.file_format, target_table.compression_type,
output_expr_ctxs, partition_expr_ctxs, column_names, partition_column_names,
target_table.write_single_file, thrift_sink.table_function_table_sink.cloud_configuration,
fragment_ctx);
...
}
}

复制代码

// Appends a chunk to the writer responsible for it. Without partition expressions
// all chunks go to one writer stored under the key "default writer"; otherwise the
// chunk goes to the writer registered for `partition_location` (its computation is
// elided in this excerpt).
Status TableFunctionTableSinkOperator::push_chunk(RuntimeState* state, const ChunkPtr& chunk) {
if (_partition_exprs.empty()) {
// Create and initialize the single writer on first use.
if (_partition_writers.empty()) {
auto writer = std::make_unique<RollingAsyncParquetWriter>(_make_table_info(_path), _output_exprs,
_common_metrics.get(), add_commit_info, state,
_driver_sequence);
RETURN_IF_ERROR(writer->init());
_partition_writers.insert({"default writer", std::move(writer)});
}
return _partition_writers["default writer"]->append_chunk(chunk.get(), state);
}
...
return _partition_writers[partition_location]->append_chunk(chunk.get(), state);
}

复制代码

// Prepares the writer for use: obtains a FileSystem for the partition location,
// copies the schema and location out of the table info, and builds the Parquet
// writer properties (dictionary on/off, compression codec, format version 2.0).
// Returns the error of filesystem creation or codec conversion if either fails.
Status RollingAsyncParquetWriter::init() {
    ASSIGN_OR_RETURN(_fs, FileSystem::CreateUniqueFromString(_table_info.partition_location,
                                                             FSOptions(&_table_info.cloud_conf)));
    _partition_location = _table_info.partition_location;
    _schema = _table_info.schema;

    ::parquet::WriterProperties::Builder props_builder;
    if (_table_info.enable_dictionary) {
        props_builder.enable_dictionary();
    } else {
        props_builder.disable_dictionary();
    }

    ASSIGN_OR_RETURN(auto codec, parquet::ParquetBuildHelper::convert_compression_type(_table_info.compress_type));
    props_builder.compression(codec);
    props_builder.version(::parquet::ParquetVersion::PARQUET_2_0);

    _properties = props_builder.build();
    return Status::OK();
}

复制代码

// Creates a FileSystem implementation suited to `uri`. The URI kind is probed in
// this order: POSIX, S3, Azure/GCS, and (only when built with USE_STAROS) starlet.
// Anything unrecognized falls back to the Hadoop FileSystem.
StatusOr<std::unique_ptr<FileSystem>> FileSystem::CreateUniqueFromString(std::string_view uri, FSOptions options) {
    if (fs::is_posix_uri(uri)) {
        return new_fs_posix();
    }

    if (fs::is_s3_uri(uri)) {
        return new_fs_s3(options);
    }

    // TODO(SmithCruise):
    // Azure storage and Google Cloud Storage currently both go through LibHdfs; a cpp sdk
    // could be used instead in the future.
    const bool served_by_libhdfs = fs::is_azure_uri(uri) || fs::is_gcs_uri(uri);
    if (served_by_libhdfs) {
        return new_fs_hdfs(options);
    }

#ifdef USE_STAROS
    if (is_starlet_uri(uri)) {
        return new_fs_starlet();
    }
#endif

    // Almost every well-known storage is compatible with the Hadoop FileSystem, so it is
    // always an option to fall back to it for accessing the storage.
    return new_fs_hdfs(options);
}

复制代码

// Abstract interface over a storage backend (see `Type` for the kinds of backends).
// Concrete filesystems implement the pure virtual operations below; a few
// operations come with default implementations.
class FileSystem {
public:
// Kinds of filesystem implementations.
enum Type { POSIX, S3, HDFS, BROKER, MEMORY, STARLET };
// Governs if/how the file is created.
//
// enum value | file exists | file does not exist
// -----------------------------+-------------------+--------------------
// CREATE_OR_OPEN_WITH_TRUNCATE | opens + truncates | creates
// CREATE_OR_OPEN | opens | creates
// MUST_CREATE | fails | creates
// MUST_EXIST | opens | fails
enum OpenMode { CREATE_OR_OPEN_WITH_TRUNCATE, CREATE_OR_OPEN, MUST_CREATE, MUST_EXIST };
...
// Create a brand new sequentially-readable file with the specified name.
// If the file does not exist, returns a non-OK status.
// This overload uses default-constructed SequentialFileOptions.
//
// The returned file will only be accessed by one thread at a time.
StatusOr<std::unique_ptr<SequentialFile>> new_sequential_file(const std::string& fname) {
return new_sequential_file(SequentialFileOptions(), fname);
}
// Like the previous new_sequential_file, but allows options to be specified.
virtual StatusOr<std::unique_ptr<SequentialFile>> new_sequential_file(const SequentialFileOptions& opts,
const std::string& fname) = 0;
// Create a brand new random access read-only file with the
// specified name.
// This overload uses default-constructed RandomAccessFileOptions.
//
// The returned file will only be accessed by one thread at a time.
StatusOr<std::unique_ptr<RandomAccessFile>> new_random_access_file(const std::string& fname) {
return new_random_access_file(RandomAccessFileOptions(), fname);
}
// Like the previous new_random_access_file, but allows options to be specified.
virtual StatusOr<std::unique_ptr<RandomAccessFile>> new_random_access_file(const RandomAccessFileOptions& opts,
const std::string& fname) = 0;
// Create an object that writes to a new file with the specified
// name. Deletes any existing file with the same name and creates a
// new file.
//
// The returned file will only be accessed by one thread at a time.
virtual StatusOr<std::unique_ptr<WritableFile>> new_writable_file(const std::string& fname) = 0;
// Like the previous new_writable_file, but allows options to be
// specified.
virtual StatusOr<std::unique_ptr<WritableFile>> new_writable_file(const WritableFileOptions& opts,
const std::string& fname) = 0;
// Returns OK if the path exists.
// NotFound if the named file does not exist,
// the calling process does not have permission to determine
// whether this file exists, or if the path is invalid.
// IOError if an IO Error was encountered
virtual Status path_exists(const std::string& fname) = 0;
// Store in *result the names of the children of the specified directory.
// The names are relative to "dir".
// Original contents of *results are dropped.
// Returns OK if "dir" exists and "*result" contains its children.
// NotFound if "dir" does not exist, the calling process does not have
// permission to access "dir", or if "dir" is invalid.
// IOError if an IO Error was encountered
virtual Status get_children(const std::string& dir, std::vector<std::string>* result) = 0;
// Iterate the specified directory and call the given callback function with each
// child's name. This function continues execution until all children have been
// iterated or the callback function returns false.
// The names are relative to "dir".
//
// The extra cost of the function call is acceptable. Compared with returning all children
// into a given vector, the performance of this method is 5% worse. However this
// approach is more flexible and efficient in fulfilling other requirements.
//
// Returns OK if "dir" exists.
// NotFound if "dir" does not exist, the calling process does not have
// permission to access "dir", or if "dir" is invalid.
// IOError if an IO Error was encountered
virtual Status iterate_dir(const std::string& dir, const std::function<bool(std::string_view)>& cb) = 0;
// `iterate_dir2` is similar to `iterate_dir` but in addition to returning the directory entry name, it
// also returns some file statistics.
virtual Status iterate_dir2(const std::string& dir, const std::function<bool(DirEntry)>& cb) = 0;
// Delete the named file.
// FIXME: If the named file does not exist, OK or NOT_FOUND is returned, depending on the implementation.
virtual Status delete_file(const std::string& fname) = 0;
// Create the specified directory.
// NOTE: It will return an error if the path already exists (not necessarily as a directory)
virtual Status create_dir(const std::string& dirname) = 0;
// Creates the directory if missing.
// Returns OK if it exists, or if it was created successfully.
virtual Status create_dir_if_missing(const std::string& dirname, bool* created = nullptr) = 0;
// Create directory for every element of 'dirname' that does not already exist.
// If 'dirname' already exists, the function does nothing (this condition is not treated as an error).
virtual Status create_dir_recursive(const std::string& dirname) = 0;
// Delete the specified directory.
// NOTE: The dir must be empty.
virtual Status delete_dir(const std::string& dirname) = 0;
// Deletes the contents of 'dirname' (if it is a directory) and the contents of all its subdirectories,
// recursively, then deletes 'dirname' itself. Symlinks are not followed (symlink is removed, not its target).
virtual Status delete_dir_recursive(const std::string& dirname) = 0;
// Synchronize the entry for a specific directory.
virtual Status sync_dir(const std::string& dirname) = 0;
// Checks if the file is a directory. Returns an error if it doesn't
// exist, otherwise return true or false.
virtual StatusOr<bool> is_directory(const std::string& path) = 0;
// Canonicalize 'path' by applying the following conversions:
// - Converts a relative path into an absolute one using the cwd.
// - Converts '.' and '..' references.
// - Resolves all symbolic links.
//
// All directory entries in 'path' must exist on the filesystem.
virtual Status canonicalize(const std::string& path, std::string* result) = 0;
// Get the size of the file given by 'fname'.
virtual StatusOr<uint64_t> get_file_size(const std::string& fname) = 0;
// Get the last modification time by given 'fname'.
virtual StatusOr<uint64_t> get_file_modified_time(const std::string& fname) = 0;
// Rename file src to target.
virtual Status rename_file(const std::string& src, const std::string& target) = 0;
// create a hard-link
virtual Status link_file(const std::string& /*old_path*/, const std::string& /*new_path*/) = 0;
// Determines the information about the filesystem on which the pathname 'path' is located.
// The default implementation reports NotSupported.
virtual StatusOr<SpaceInfo> space(const std::string& path) { return Status::NotSupported("FileSystem::space()"); }
// Given the path to a remote file, delete the file's cache on the local file system, if any.
// On success, Status::OK is returned. If there is no cache, Status::NotFound is returned.
// The default implementation always returns NotFound.
virtual Status drop_local_cache(const std::string& path) { return Status::NotFound(path); }
// Batch delete the given files.
// Returns OK if all deletions succeed (not-found errors are ignored); otherwise returns the error of the
// failed deletion. May stop at the first error when batch deletion is simulated, as in this default
// implementation, which deletes the files one by one.
virtual Status delete_files(const std::vector<std::string>& paths) {
for (auto&& path : paths) {
auto st = delete_file(path);
// A missing file is not an error for batch deletion; anything else aborts the batch.
if (!st.ok() && !st.is_not_found()) {
return st;
}
}
return Status::OK();
}
};

复制代码

// Writes `chunk` to the current file. Fails fast on a pending IO error, opens the
// first file on demand, and rolls over to a new file once the current one has grown
// beyond _max_file_size (a limit of -1 disables rolling).
Status RollingAsyncParquetWriter::append_chunk(Chunk* chunk, RuntimeState* state) {
    RETURN_IF_ERROR(get_io_status());

    if (!_writer) {
        RETURN_IF_ERROR(_new_file_writer());
    }

    // Roll over: close the file that exceeded the limit and start a new one.
    const bool has_size_limit = (_max_file_size != -1);
    if (has_size_limit && _writer->file_size() > _max_file_size) {
        RETURN_IF_ERROR(close_current_writer(state));
        RETURN_IF_ERROR(_new_file_writer());
    }

    return _writer->write(chunk);
}
// Opens a writable file at a newly generated location (truncating an existing one,
// no sync on close), wraps it in an AsyncFileWriter that becomes the current
// writer, and returns the status of initializing that writer.
Status RollingAsyncParquetWriter::_new_file_writer() {
    std::string file_location = _new_file_location();
    WritableFileOptions file_opts{.sync_on_close = false, .mode = FileSystem::CREATE_OR_OPEN_WITH_TRUNCATE};
    ASSIGN_OR_RETURN(auto out_file, _fs->new_writable_file(file_opts, file_location));

    _writer = std::make_shared<starrocks::parquet::AsyncFileWriter>(
            std::move(out_file), file_location, _partition_location, _properties, _schema, _output_expr_ctxs,
            ExecEnv::GetInstance()->pipeline_sink_io_pool(), _parent_profile, _max_file_size);
    return _writer->init();
}

复制代码

// Opens the underlying Parquet file writer on the output stream with the configured
// schema and properties. Returns InternalError when no writer could be obtained.
Status FileWriterBase::init() {
    _writer = ::parquet::ParquetFileWriter::Open(_outstream, _schema, _properties);
    if (_writer != nullptr) {
        return Status::OK();
    }
    return Status::InternalError("Failed to create file writer");
}
// Makes sure a chunk writer exists: when there is none, appends a buffered row
// group to the file writer and builds a ChunkWriter on top of it. Requires init()
// to have produced a file writer.
void FileWriterBase::_generate_chunk_writer() {
    DCHECK(_writer != nullptr);
    if (_chunk_writer != nullptr) {
        return;
    }
    auto row_group_writer = _writer->AppendBufferedRowGroup();
    _chunk_writer = std::make_unique<ChunkWriter>(row_group_writer, _type_descs, _schema, _eval_func);
}
// Writes `chunk` into the current row group; chunks without rows are ignored.
// After writing, the row group is flushed when its estimated buffered size exceeds
// _max_row_group_size and it is not the last row group.
Status FileWriterBase::write(Chunk* chunk) {
    if (chunk->has_rows()) {
        _generate_chunk_writer();
        RETURN_IF_ERROR(_chunk_writer->write(chunk));

        const bool row_group_full = _chunk_writer->estimated_buffered_bytes() > _max_row_group_size;
        if (row_group_full && !is_last_row_group()) {
            RETURN_IF_ERROR(_flush_row_group());
        }
    }
    return Status::OK();
}

复制代码