Source code for pyflink.datastream.formats.orc

################################################################################
#  Licensed to the Apache Software Foundation (ASF) under one
#  or more contributor license agreements.  See the NOTICE file
#  distributed with this work for additional information
#  regarding copyright ownership.  The ASF licenses this file
#  to you under the Apache License, Version 2.0 (the
#  "License"); you may not use this file except in compliance
#  with the License.  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
################################################################################
from typing import Optional, TYPE_CHECKING

from pyflink.common import Configuration
from pyflink.common.serialization import BulkWriterFactory, RowDataBulkWriterFactory
from pyflink.datastream.utils import create_hadoop_configuration, create_java_properties
from pyflink.java_gateway import get_gateway
from pyflink.util.java_utils import to_jarray

if TYPE_CHECKING:
    from pyflink.table.types import RowType

__all__ = [
    'OrcBulkWriters'
]


class OrcBulkWriters(object):
    """
    Convenient builder to create a :class:`~pyflink.common.serialization.BulkWriterFactory`
    that writes records with a predefined schema into Orc files in a batch fashion.

    Example:
    ::

        >>> row_type = DataTypes.ROW([
        ...     DataTypes.FIELD('string', DataTypes.STRING()),
        ...     DataTypes.FIELD('int_array', DataTypes.ARRAY(DataTypes.INT()))
        ... ])
        >>> sink = FileSink.for_bulk_format(
        ...     OUTPUT_DIR, OrcBulkWriters.for_row_type(
        ...         row_type=row_type,
        ...         writer_properties=Configuration(),
        ...         hadoop_config=Configuration(),
        ...     )
        ... ).build()
        >>> ds.sink_to(sink)

    .. versionadded:: 1.16.0
    """

    @staticmethod
    def for_row_type(row_type: 'RowType',
                     writer_properties: Optional[Configuration] = None,
                     hadoop_config: Optional[Configuration] = None) \
            -> BulkWriterFactory:
        """
        Create a :class:`~pyflink.common.serialization.BulkWriterFactory` that writes records
        with a predefined schema into Orc files in a batch fashion.

        :param row_type: The RowType of the records; it must match the RowTypeInfo of the
                         Row records.
        :param writer_properties: Orc writer options.
        :param hadoop_config: Hadoop configuration.
        """
        from pyflink.table.types import RowType
        if not isinstance(row_type, RowType):
            raise TypeError('row_type must be an instance of RowType')

        from pyflink.table.types import _to_java_data_type
        j_data_type = _to_java_data_type(row_type)
        jvm = get_gateway().jvm
        j_row_type = j_data_type.getLogicalType()
        orc_types = to_jarray(
            jvm.org.apache.flink.table.types.logical.LogicalType,
            [i for i in j_row_type.getChildren()]
        )
        type_description = jvm.org.apache.flink.orc \
            .OrcSplitReaderUtil.logicalTypeToOrcType(j_row_type)
        if writer_properties is None:
            writer_properties = Configuration()
        if hadoop_config is None:
            hadoop_config = Configuration()

        return RowDataBulkWriterFactory(
            jvm.org.apache.flink.orc.writer.OrcBulkWriterFactory(
                jvm.org.apache.flink.orc.vector.RowDataVectorizer(
                    type_description.toString(),
                    orc_types
                ),
                create_java_properties(writer_properties),
                create_hadoop_configuration(hadoop_config)
            ),
            row_type
        )
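

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module): a minimal,
# self-contained pipeline showing how OrcBulkWriters.for_row_type plugs into a
# FileSink. The output directory '/tmp/orc-output' and the job name are
# hypothetical, and actually running this assumes the flink-orc and Hadoop
# dependencies are available on the classpath.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    from pyflink.common import Row
    from pyflink.common.typeinfo import Types
    from pyflink.datastream import StreamExecutionEnvironment
    from pyflink.datastream.connectors.file_system import FileSink
    from pyflink.table.types import DataTypes

    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)

    # The RowType passed to for_row_type must match the RowTypeInfo of the stream.
    row_type = DataTypes.ROW([
        DataTypes.FIELD('name', DataTypes.STRING()),
        DataTypes.FIELD('count', DataTypes.INT()),
    ])
    type_info = Types.ROW_NAMED(['name', 'count'], [Types.STRING(), Types.INT()])

    ds = env.from_collection([Row('a', 1), Row('b', 2)], type_info=type_info)

    sink = FileSink.for_bulk_format(
        '/tmp/orc-output',  # hypothetical output directory
        OrcBulkWriters.for_row_type(
            row_type=row_type,
            writer_properties=Configuration(),
            hadoop_config=Configuration(),
        )
    ).build()

    ds.sink_to(sink)
    env.execute('orc_bulk_writer_example')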