public class Metadata extends Object
ParquetReaderConfig.
All creation of the parquet metadata cache through the create APIs is forced to happen as the process user,
since only that user has write permission for the cache file.
Modifier and Type | Field and Description |
---|---|
static String[] |
CURRENT_METADATA_FILENAMES |
static Long |
DEFAULT_NULL_COUNT |
static String |
METADATA_DIRECTORIES_FILENAME |
static String |
METADATA_FILENAME |
static String |
METADATA_SUMMARY_FILENAME |
static Long |
NULL_COUNT_NOT_EXISTS |
static String |
OLD_METADATA_FILENAME |
static String[] |
OLD_METADATA_FILENAMES |
Modifier and Type | Method and Description |
---|---|
static void |
createMeta(org.apache.hadoop.fs.FileSystem fs,
org.apache.hadoop.fs.Path path,
ParquetReaderConfig readerConfig,
boolean allColumnsInteresting,
Set<SchemaPath> columnSet)
Create the parquet metadata file for the directory at the given path, and for any subdirectories.
|
static org.apache.hadoop.fs.Path |
getDirFileName(org.apache.hadoop.fs.Path metadataParentDir) |
static Metadata_V4.ParquetFileAndRowCountMetadata |
getParquetFileMetadata_v4(Metadata_V4.ParquetTableMetadata_v4 parquetTableMetadata,
org.apache.parquet.hadoop.metadata.ParquetMetadata footer,
org.apache.hadoop.fs.FileStatus file,
org.apache.hadoop.fs.FileSystem fs,
boolean allColumnsInteresting,
boolean skipNonInteresting,
Set<SchemaPath> columnSet,
ParquetReaderConfig readerConfig)
Get the file metadata for a single file
|
static Metadata_V4.ParquetTableMetadata_v4 |
getParquetTableMetadata(org.apache.hadoop.fs.FileSystem fs,
org.apache.hadoop.fs.Path path,
ParquetReaderConfig readerConfig)
Get the parquet metadata for the parquet files in the given directory, including those in subdirectories.
|
static Metadata_V4.ParquetTableMetadata_v4 |
getParquetTableMetadata(Map<org.apache.hadoop.fs.FileStatus,org.apache.hadoop.fs.FileSystem> fileStatusMap,
ParquetReaderConfig readerConfig)
Get the parquet metadata for a list of parquet files.
|
static Metadata_V4.MetadataSummary |
getSummary(org.apache.hadoop.fs.FileSystem fs,
org.apache.hadoop.fs.Path metadataParentDir,
boolean autoRefreshTriggered,
ParquetReaderConfig readerConfig)
Reads the summary from the metadata cache file; if the cache file is stale, recreates the metadata.
|
static org.apache.hadoop.fs.Path |
getSummaryFileName(org.apache.hadoop.fs.Path metadataParentDir) |
static MetadataBase.ParquetTableMetadataBase |
readBlockMeta(org.apache.hadoop.fs.FileSystem fs,
List<org.apache.hadoop.fs.Path> paths,
MetadataContext metaContext,
ParquetReaderConfig readerConfig)
Get the parquet metadata for the table by reading the metadata file
|
static ParquetTableMetadataDirs |
readMetadataDirs(org.apache.hadoop.fs.FileSystem fs,
org.apache.hadoop.fs.Path path,
MetadataContext metaContext,
ParquetReaderConfig readerConfig)
Get the parquet metadata for all subdirectories by reading the metadata file
|
public static final String[] OLD_METADATA_FILENAMES
public static final String OLD_METADATA_FILENAME
public static final String METADATA_DIRECTORIES_FILENAME
public static final String METADATA_FILENAME
public static final String METADATA_SUMMARY_FILENAME
public static final String[] CURRENT_METADATA_FILENAMES
public static final Long DEFAULT_NULL_COUNT
public static final Long NULL_COUNT_NOT_EXISTS
public static void createMeta(org.apache.hadoop.fs.FileSystem fs, org.apache.hadoop.fs.Path path, ParquetReaderConfig readerConfig, boolean allColumnsInteresting, Set<SchemaPath> columnSet) throws IOException
Parameters:
fs - file system
path - path
readerConfig - parquet reader configuration
allColumnsInteresting - if set, store column metadata for all the columns
columnSet - Set of columns for which column metadata has to be stored
Throws:
IOException
public static Metadata_V4.ParquetTableMetadata_v4 getParquetTableMetadata(org.apache.hadoop.fs.FileSystem fs, org.apache.hadoop.fs.Path path, ParquetReaderConfig readerConfig) throws IOException
Parameters:
fs - file system
path - path
readerConfig - parquet reader configuration
Throws:
IOException
public static Metadata_V4.ParquetTableMetadata_v4 getParquetTableMetadata(Map<org.apache.hadoop.fs.FileStatus,org.apache.hadoop.fs.FileSystem> fileStatusMap, ParquetReaderConfig readerConfig) throws IOException
Parameters:
fileStatusMap - file statuses and corresponding file systems
readerConfig - parquet reader configuration
Throws:
IOException
public static MetadataBase.ParquetTableMetadataBase readBlockMeta(org.apache.hadoop.fs.FileSystem fs, List<org.apache.hadoop.fs.Path> paths, MetadataContext metaContext, ParquetReaderConfig readerConfig)
Parameters:
fs - current file system
paths - The path to the metadata file, located in the directory that contains the parquet files
metaContext - metadata context
readerConfig - parquet reader configuration
public static ParquetTableMetadataDirs readMetadataDirs(org.apache.hadoop.fs.FileSystem fs, org.apache.hadoop.fs.Path path, MetadataContext metaContext, ParquetReaderConfig readerConfig)
Parameters:
fs - current file system
path - The path to the metadata file, located in the directory that contains the parquet files
metaContext - metadata context
readerConfig - parquet reader configuration
public static Metadata_V4.ParquetFileAndRowCountMetadata getParquetFileMetadata_v4(Metadata_V4.ParquetTableMetadata_v4 parquetTableMetadata, org.apache.parquet.hadoop.metadata.ParquetMetadata footer, org.apache.hadoop.fs.FileStatus file, org.apache.hadoop.fs.FileSystem fs, boolean allColumnsInteresting, boolean skipNonInteresting, Set<SchemaPath> columnSet, ParquetReaderConfig readerConfig) throws IOException, InterruptedException
Parameters:
parquetTableMetadata - The table metadata to be updated with all the columns' info
footer - If non-null, use this footer instead of reading it from the file
file - The file
allColumnsInteresting - If true, read the min/max metadata for all the columns
skipNonInteresting - If true, collect info only for the interesting columns
columnSet - Specifies the columns for which min/max metadata is collected
readerConfig - for the options
Throws:
IOException
InterruptedException
public static org.apache.hadoop.fs.Path getSummaryFileName(org.apache.hadoop.fs.Path metadataParentDir)
public static org.apache.hadoop.fs.Path getDirFileName(org.apache.hadoop.fs.Path metadataParentDir)
public static Metadata_V4.MetadataSummary getSummary(org.apache.hadoop.fs.FileSystem fs, org.apache.hadoop.fs.Path metadataParentDir, boolean autoRefreshTriggered, ParquetReaderConfig readerConfig)
Parameters:
fs - file system
metadataParentDir - parent directory that holds metadata files
autoRefreshTriggered - true if the auto-refresh is already triggered
readerConfig - Parquet reader config
Copyright © 1970 The Apache Software Foundation. All rights reserved.