public class WikipediaDumpReader
extends org.dice_research.topicmodeling.preprocessing.docsupplier.AbstractDocumentSupplier
implements org.dice_research.topicmodeling.io.xml.XMLParserObserver
| Modifier and Type | Field and Description |
|---|---|
| private static String | ARTICLE_ID_XML_TAG_NAME |
| private String | data |
| private org.dice_research.topicmodeling.utils.doc.Document | document |
| private static String | DOCUMENT_XML_TAG_NAME |
| private Reader | dumpReader |
| private String | lastIdContainingTag |
| private static org.slf4j.Logger | LOGGER |
| private static String | NAMESPACE_XML_TAG_NAME |
| private static String | REDIRECT_XML_TAG_NAME |
| private static String | REVISION_XML_TAG_NAME |
| private static String | TEXT_XML_TAG_NAME |
| private static String | TITLE_XML_TAG_NAME |
| private org.dice_research.topicmodeling.io.xml.stream.SimpleReaderBasedXMLParser | xmlParser |
| Modifier | Constructor and Description |
|---|---|
| private | WikipediaDumpReader(Reader reader) |
| Modifier and Type | Method and Description |
|---|---|
| static WikipediaDumpReader | createReader(File file) |
| static WikipediaDumpReader | createReader(File file, Charset charset) |
| static WikipediaDumpReader | createReader(InputStream input, Charset charset) |
| static WikipediaDumpReader | createReader(String filename) |
| static WikipediaDumpReader | createReader(String filename, Charset charset) |
| org.dice_research.topicmodeling.utils.doc.Document | getNextDocument() |
| void | handleClosingTag(String tagString) |
| void | handleData(String data) |
| void | handleEmptyTag(String tagString) |
| void | handleOpeningTag(String tagString) |
Inherited methods: getNextDocumentId, setDocumentStartId
private static final org.slf4j.Logger LOGGER
private static final String DOCUMENT_XML_TAG_NAME
private static final String TITLE_XML_TAG_NAME
private static final String TEXT_XML_TAG_NAME
private static final String ARTICLE_ID_XML_TAG_NAME
private static final String REVISION_XML_TAG_NAME
private static final String REDIRECT_XML_TAG_NAME
private static final String NAMESPACE_XML_TAG_NAME
private Reader dumpReader
private org.dice_research.topicmodeling.io.xml.stream.SimpleReaderBasedXMLParser xmlParser
private String data
private org.dice_research.topicmodeling.utils.doc.Document document
private String lastIdContainingTag
private WikipediaDumpReader(Reader reader)
public static WikipediaDumpReader createReader(String filename) throws FileNotFoundException
Throws: FileNotFoundException
public static WikipediaDumpReader createReader(String filename, Charset charset) throws FileNotFoundException
Throws: FileNotFoundException
public static WikipediaDumpReader createReader(File file) throws FileNotFoundException
Throws: FileNotFoundException
public static WikipediaDumpReader createReader(File file, Charset charset) throws FileNotFoundException
Throws: FileNotFoundException
public static WikipediaDumpReader createReader(InputStream input, Charset charset)
public org.dice_research.topicmodeling.utils.doc.Document getNextDocument()
Specified by: getNextDocument in interface org.dice_research.topicmodeling.preprocessing.docsupplier.DocumentSupplier
public void handleOpeningTag(String tagString)
Specified by: handleOpeningTag in interface org.dice_research.topicmodeling.io.xml.XMLParserObserver
public void handleClosingTag(String tagString)
Specified by: handleClosingTag in interface org.dice_research.topicmodeling.io.xml.XMLParserObserver
public void handleData(String data)
Specified by: handleData in interface org.dice_research.topicmodeling.io.xml.XMLParserObserver
public void handleEmptyTag(String tagString)
Specified by: handleEmptyTag in interface org.dice_research.topicmodeling.io.xml.XMLParserObserver
Copyright © 2015–2020. All rights reserved.