Type Parameters: T - the type of the records in the RDD
public class RddRdfSaver<T> extends Object
| Modifier and Type | Field and Description |
|---|---|
| protected boolean | allowOverwriteFiles |
| protected Function<org.apache.spark.api.java.JavaRDD<T>,org.apache.spark.api.java.JavaRDD<org.apache.jena.sparql.core.Quad>> | convertToQuad |
| protected Function<org.apache.spark.api.java.JavaRDD<T>,org.apache.spark.api.java.JavaRDD<org.apache.jena.graph.Triple>> | convertToTriple |
| protected boolean | deletePartitionFolderAfterMerge |
| protected org.apache.jena.shared.PrefixMapping | globalPrefixMapping |
| protected org.apache.hadoop.conf.Configuration | hadoopConfiguration |
| protected boolean | mapQuadsToTriplesForTripleLangs<br>Whether to convert quads to triples if a triple-based output format is requested |
| protected org.apache.jena.riot.RDFFormat | outputFormat |
| protected org.apache.hadoop.fs.Path | partitionFolder |
| protected boolean | partitionsAsIndependentFiles |
| protected org.apache.spark.api.java.JavaRDD<T> | rdd |
| protected BiConsumer<T,org.apache.jena.riot.system.StreamRDF> | sendRecordToStreamRDF |
| protected org.apache.spark.api.java.JavaSparkContext | sparkContext |
| protected org.apache.hadoop.fs.Path | targetFile |
| protected boolean | useCoalesceOne |
| protected boolean | useElephas |
| Constructor and Description |
|---|
| RddRdfSaver(org.apache.spark.api.java.JavaRDD<T> rdd, BiConsumer<T,org.apache.jena.riot.system.StreamRDF> sendRecordToStreamRDF, Function<org.apache.spark.api.java.JavaRDD<T>,org.apache.spark.api.java.JavaRDD<org.apache.jena.graph.Triple>> convertToTriple, Function<org.apache.spark.api.java.JavaRDD<T>,org.apache.spark.api.java.JavaRDD<org.apache.jena.sparql.core.Quad>> convertToQuad) |
| Modifier and Type | Method and Description |
|---|---|
| static <T> RddRdfSaver | create(org.apache.spark.api.java.JavaRDD<T> rdd, org.aksw.commons.lambda.serializable.SerializableBiConsumer<T,org.apache.jena.riot.system.StreamRDF> sendRecordToStreamRDF, Function<org.apache.spark.api.java.JavaRDD<T>,org.apache.spark.api.java.JavaRDD<org.apache.jena.graph.Triple>> convertToTriple, Function<org.apache.spark.api.java.JavaRDD<T>,org.apache.spark.api.java.JavaRDD<org.apache.jena.sparql.core.Quad>> convertToQuad)<br>Create method. |
| static RddRdfSaver<org.apache.jena.query.Dataset> | createForDataset(org.apache.spark.api.java.JavaRDD<org.apache.jena.query.Dataset> rdd) |
| static RddRdfSaver<org.apache.jena.sparql.core.Quad> | createForQuad(org.apache.spark.api.java.JavaRDD<org.apache.jena.sparql.core.Quad> rdd) |
| static RddRdfSaver<org.apache.jena.graph.Triple> | createForTriple(org.apache.spark.api.java.JavaRDD<org.apache.jena.graph.Triple> rdd) |
| static Function<OutputStream,org.apache.jena.riot.system.StreamRDF> | createStreamRDFFactory(org.apache.jena.riot.RDFFormat rdfFormat, boolean mapQuadsToTriplesForTripleLangs, org.apache.jena.shared.PrefixMapping prefixMapping)<br>Create a function that can create a StreamRDF instance that is backed by the given OutputStream. |
| org.apache.jena.shared.PrefixMapping | getGlobalPrefixMapping() |
| org.apache.jena.riot.RDFFormat | getOutputFormat() |
| org.apache.hadoop.fs.Path | getPartitionFolder() |
| org.apache.hadoop.fs.Path | getTargetFile() |
| boolean | isAllowOverwriteFiles() |
| boolean | isDeletePartitionFolderAfterMerge() |
| boolean | isMapQuadsToTriplesForTripleLangs() |
| boolean | isPartitionsAsIndependentFiles() |
| boolean | isUseCoalesceOne() |
| boolean | isUseElephas() |
| static void | mergeFolder(Path outFile, Path srcFolder, String pattern, Comparator<? super Path> pathComparator) |
| RddRdfSaver<T> | mutate(Consumer<RddRdfSaver<T>> action)<br>Pass this object to a consumer. |
| static Iterator<String> | partitionMapperNQuads(Iterator<org.apache.jena.sparql.core.Quad> it) |
| static Iterator<String> | partitionMapperNTriples(Iterator<org.apache.jena.graph.Triple> it)<br>Save the RDD to a single file. |
| static <T> org.aksw.commons.lambda.throwing.ThrowingFunction<Iterator<T>,Iterator<String>> | partitionMapperRDFStream(Function<OutputStream,org.apache.jena.riot.system.StreamRDF> streamRDFFactory, BiConsumer<? super T,org.apache.jena.riot.system.StreamRDF> sendRecordToWriter) |
| void | run()<br>Run the save action according to configuration |
| static <T> void | saveToFolder(org.apache.spark.api.java.JavaRDD<T> javaRdd, String path, org.apache.jena.riot.RDFFormat rdfFormat, boolean mapQuadsToTriplesForTripleLangs, org.apache.jena.shared.PrefixMapping globalPrefixMapping, BiConsumer<T,org.apache.jena.riot.system.StreamRDF> sendRecordToStreamRDF)<br>Save the data in Trig/Turtle or its sub-formats (n-quads/n-triples) format. |
| static <T> void | saveUsingElephas(org.apache.spark.api.java.JavaRDD<T> rdd, org.apache.hadoop.fs.Path path, org.apache.jena.riot.Lang lang, org.aksw.commons.lambda.serializable.SerializableFunction<? super T,?> recordToWritable) |
| RddRdfSaver<T> | setAllowOverwriteFiles(boolean allowOverwriteFiles) |
| RddRdfSaver<T> | setDeletePartitionFolderAfterMerge(boolean deletePartitionFolderAfterMerge) |
| RddRdfSaver<T> | setGlobalPrefixMapping(org.apache.jena.shared.PrefixMapping globalPrefixMapping)<br>Set a prefix mapping to be used "globally" across all partitions. |
| RddRdfSaver | setMapQuadsToTriplesForTripleLangs(boolean mapQuadsToTriplesForTripleLangs)<br>Whether to convert quads to triples if a triple-based output format is requested. Jena by default discards any quad outside of the default graph when writing to a triple format. |
| RddRdfSaver<T> | setOutputFormat(org.apache.jena.riot.RDFFormat outputFormat) |
| RddRdfSaver<T> | setPartitionFolder(org.apache.hadoop.fs.Path partitionFolder) |
| RddRdfSaver<T> | setPartitionFolder(String partitionFolder) |
| RddRdfSaver<T> | setPartitionsAsIndependentFiles(boolean partitionsAsIndependentFiles) |
| RddRdfSaver<T> | setTargetFile(org.apache.hadoop.fs.Path targetFile) |
| RddRdfSaver<T> | setTargetFile(String targetFile) |
| void | setUseCoalesceOne(boolean useCoalesceOne) |
| RddRdfSaver<T> | setUseElephas(boolean useElephas) |
| static String | toString(org.apache.jena.shared.PrefixMapping prefixMapping, org.apache.jena.riot.RDFFormat rdfFormat)<br>Convert a prefix mapping to a string |
| static void | validateOutFolder(org.apache.hadoop.fs.Path path, org.apache.hadoop.conf.Configuration conf, boolean deleteIfExists) |
protected org.apache.spark.api.java.JavaRDD<T> rdd
protected org.apache.hadoop.fs.Path partitionFolder
protected org.apache.hadoop.fs.Path targetFile
protected boolean useCoalesceOne
protected boolean deletePartitionFolderAfterMerge
protected org.apache.jena.shared.PrefixMapping globalPrefixMapping
protected org.apache.jena.riot.RDFFormat outputFormat
protected boolean allowOverwriteFiles
protected boolean useElephas
protected boolean partitionsAsIndependentFiles
protected boolean mapQuadsToTriplesForTripleLangs
protected BiConsumer<T,org.apache.jena.riot.system.StreamRDF> sendRecordToStreamRDF
protected Function<org.apache.spark.api.java.JavaRDD<T>,org.apache.spark.api.java.JavaRDD<org.apache.jena.graph.Triple>> convertToTriple
protected Function<org.apache.spark.api.java.JavaRDD<T>,org.apache.spark.api.java.JavaRDD<org.apache.jena.sparql.core.Quad>> convertToQuad
protected org.apache.spark.api.java.JavaSparkContext sparkContext
protected org.apache.hadoop.conf.Configuration hadoopConfiguration
public RddRdfSaver(org.apache.spark.api.java.JavaRDD<T> rdd, BiConsumer<T,org.apache.jena.riot.system.StreamRDF> sendRecordToStreamRDF, Function<org.apache.spark.api.java.JavaRDD<T>,org.apache.spark.api.java.JavaRDD<org.apache.jena.graph.Triple>> convertToTriple, Function<org.apache.spark.api.java.JavaRDD<T>,org.apache.spark.api.java.JavaRDD<org.apache.jena.sparql.core.Quad>> convertToQuad)
public boolean isMapQuadsToTriplesForTripleLangs()
public RddRdfSaver setMapQuadsToTriplesForTripleLangs(boolean mapQuadsToTriplesForTripleLangs)
public boolean isUseCoalesceOne()
public void setUseCoalesceOne(boolean useCoalesceOne)
public boolean isDeletePartitionFolderAfterMerge()
public RddRdfSaver<T> setDeletePartitionFolderAfterMerge(boolean deletePartitionFolderAfterMerge)
public org.apache.jena.shared.PrefixMapping getGlobalPrefixMapping()
public org.apache.hadoop.fs.Path getPartitionFolder()
public RddRdfSaver<T> setPartitionFolder(org.apache.hadoop.fs.Path partitionFolder)
public RddRdfSaver<T> setPartitionFolder(String partitionFolder)
public org.apache.hadoop.fs.Path getTargetFile()
public RddRdfSaver<T> setTargetFile(org.apache.hadoop.fs.Path targetFile)
public RddRdfSaver<T> setTargetFile(String targetFile)
public RddRdfSaver<T> setGlobalPrefixMapping(org.apache.jena.shared.PrefixMapping globalPrefixMapping)
Parameters: globalPrefixMapping - the prefix mapping to be used "globally" across all partitions
public org.apache.jena.riot.RDFFormat getOutputFormat()
public RddRdfSaver<T> setOutputFormat(org.apache.jena.riot.RDFFormat outputFormat)
public boolean isAllowOverwriteFiles()
public RddRdfSaver<T> setAllowOverwriteFiles(boolean allowOverwriteFiles)
public boolean isUseElephas()
public RddRdfSaver<T> setUseElephas(boolean useElephas)
public boolean isPartitionsAsIndependentFiles()
public RddRdfSaver<T> setPartitionsAsIndependentFiles(boolean partitionsAsIndependentFiles)
public RddRdfSaver<T> mutate(Consumer<RddRdfSaver<T>> action)
rdd.configureSave().mutate(self -> { if (condition) { self.setX(); }}).run();
Parameters: action - the consumer to which this object is passed
public void run()
throws IOException
Throws: IOException
public static void validateOutFolder(org.apache.hadoop.fs.Path path,
org.apache.hadoop.conf.Configuration conf,
boolean deleteIfExists)
throws IOException
Throws: IOException
public static String toString(org.apache.jena.shared.PrefixMapping prefixMapping, org.apache.jena.riot.RDFFormat rdfFormat)
public static void mergeFolder(Path outFile, Path srcFolder, String pattern, Comparator<? super Path> pathComparator) throws IOException
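Throws: IOException
public static Iterator<String> partitionMapperNTriples(Iterator<org.apache.jena.graph.Triple> it)
Parameters (named in the Scala saveToFile signature shown below, not in this method's signature):
mode -
exitOnError -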
def saveToFile(outFile: String,
prefixMapping: PrefixMapping,
rdfFormat: RDFFormat,
outFolder: String,
mode: io.SaveMode.Value = SaveMode.ErrorIfExists,
exitOnError: Boolean = false): Unit = {
  val outFilePath = Paths.get(outFile).toAbsolutePath
  val outFileFileName = outFilePath.getFileName.toString
  val outFolderPath =
    if (outFolder == null) outFilePath.resolveSibling(outFileFileName + "-parts")
    else Paths.get(outFolder).toAbsolutePath

  saveToFolder(outFolderPath.toString, prefixMapping, rdfFormat, mode, exitOnError)
  mergeFolder(outFilePath, outFolderPath, "part*")
}
public static Iterator<String> partitionMapperNQuads(Iterator<org.apache.jena.sparql.core.Quad> it)
public static Function<OutputStream,org.apache.jena.riot.system.StreamRDF> createStreamRDFFactory(org.apache.jena.riot.RDFFormat rdfFormat, boolean mapQuadsToTriplesForTripleLangs, org.apache.jena.shared.PrefixMapping prefixMapping)
Parameters:
rdfFormat -
prefixMapping -
public static <T> org.aksw.commons.lambda.throwing.ThrowingFunction<Iterator<T>,Iterator<String>> partitionMapperRDFStream(Function<OutputStream,org.apache.jena.riot.system.StreamRDF> streamRDFFactory, BiConsumer<? super T,org.apache.jena.riot.system.StreamRDF> sendRecordToWriter)
public static <T> void saveToFolder(org.apache.spark.api.java.JavaRDD<T> javaRdd,
String path,
org.apache.jena.riot.RDFFormat rdfFormat,
boolean mapQuadsToTriplesForTripleLangs,
org.apache.jena.shared.PrefixMapping globalPrefixMapping,
BiConsumer<T,org.apache.jena.riot.system.StreamRDF> sendRecordToStreamRDF)
throws IOException
Parameters:
path - the folder into which the file(s) will be written
mode - the expected behavior of saving the data to a data source
Throws:
IOException
public static <T> void saveUsingElephas(org.apache.spark.api.java.JavaRDD<T> rdd,
org.apache.hadoop.fs.Path path,
org.apache.jena.riot.Lang lang,
org.aksw.commons.lambda.serializable.SerializableFunction<? super T,?> recordToWritable)
public static <T> RddRdfSaver create(org.apache.spark.api.java.JavaRDD<T> rdd, org.aksw.commons.lambda.serializable.SerializableBiConsumer<T,org.apache.jena.riot.system.StreamRDF> sendRecordToStreamRDF, Function<org.apache.spark.api.java.JavaRDD<T>,org.apache.spark.api.java.JavaRDD<org.apache.jena.graph.Triple>> convertToTriple, Function<org.apache.spark.api.java.JavaRDD<T>,org.apache.spark.api.java.JavaRDD<org.apache.jena.sparql.core.Quad>> convertToQuad)
Type Parameters:
T -
Parameters:
rdd -
sendRecordToStreamRDF -
convertToTriple -
convertToQuad -
public static RddRdfSaver<org.apache.jena.graph.Triple> createForTriple(org.apache.spark.api.java.JavaRDD<org.apache.jena.graph.Triple> rdd)
public static RddRdfSaver<org.apache.jena.sparql.core.Quad> createForQuad(org.apache.spark.api.java.JavaRDD<org.apache.jena.sparql.core.Quad> rdd)
public static RddRdfSaver<org.apache.jena.query.Dataset> createForDataset(org.apache.spark.api.java.JavaRDD<org.apache.jena.query.Dataset> rdd)
Copyright © 2016–2021 Smart Data Analytics (SDA) Research Group. All rights reserved.