|
||||||||||
| PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
| SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD | |||||||||
java.lang.Object
gate.util.AbstractFeatureBearer
gate.creole.AbstractResource
gate.creole.AbstractProcessingResource
gate.creole.AbstractLanguageAnalyser
gate.creole.splitter.RegexSentenceSplitter
public class RegexSentenceSplitter
A fast sentence splitter replacement based on regular expressions.
| Nested Class Summary | |
|---|---|
private class |
RegexSentenceSplitter.MatchResultComparator
A comparator for MatchResult objects. |
| Nested classes/interfaces inherited from class gate.creole.AbstractProcessingResource |
|---|
AbstractProcessingResource.InternalStatusListener, AbstractProcessingResource.IntervalProgressListener |
| Field Summary | |
|---|---|
protected Document |
document
The document to be processed |
protected String |
encoding
Encoding used when reading config files |
protected URL |
externalSplitListURL
URL pointing to a file with regex patterns for external sentence splits. |
protected Pattern |
externalSplitsPattern
|
protected URL |
internalSplitListURL
URL pointing to a file with regex patterns for internal sentence splits. |
protected Pattern |
internalSplitsPattern
|
protected URL |
nonSplitListURL
URL pointing to a file with regex patterns for non sentence splits. |
protected Pattern |
nonSplitsPattern
|
protected String |
outputASName
Output annotation set name. |
private static long |
serialVersionUID
serialisation ID |
static String |
SPLIT_DOCUMENT_PARAMETER_NAME
Parameter name |
static String |
SPLIT_ENCODING_PARAMETER_NAME
Parameter name |
static String |
SPLIT_INPUT_AS_PARAMETER_NAME
Parameter name |
static String |
SPLIT_NON_SPLIT_LIST_PARAMETER_NAME
Parameter name |
static String |
SPLIT_OUTPUT_AS_PARAMETER_NAME
Parameter name |
static String |
SPLIT_SPLIT_LIST_PARAMETER_NAME
Parameter name |
| Fields inherited from class gate.creole.AbstractLanguageAnalyser |
|---|
corpus |
| Fields inherited from class gate.creole.AbstractProcessingResource |
|---|
interrupted |
| Fields inherited from class gate.creole.AbstractResource |
|---|
name |
| Fields inherited from class gate.util.AbstractFeatureBearer |
|---|
features |
| Constructor Summary | |
|---|---|
RegexSentenceSplitter()
|
|
| Method Summary | |
|---|---|
protected Pattern |
compilePattern(URL paternsListUrl,
String encoding)
|
void |
execute()
Run the resource. |
Document |
getDocument()
Get the document property for this analyser. |
String |
getEncoding()
|
URL |
getExternalSplitListURL()
|
URL |
getInternalSplitListURL()
|
Pattern |
getInternalSplitsPattern()
|
URL |
getNonSplitListURL()
|
String |
getOutputASName()
|
Resource |
init()
Initialise this resource, and return it. |
void |
setDocument(Document document)
Set the document property for this analyser. |
void |
setEncoding(String encoding)
|
void |
setExternalSplitListURL(URL externalSplitListURL)
|
void |
setInternalSplitListURL(URL internalSplitListURL)
|
void |
setInternalSplitsPattern(Pattern internalSplitsPattern)
|
void |
setNonSplitListURL(URL nonSplitListURL)
|
void |
setOutputASName(String outputASName)
|
private boolean |
veto(MatchResult split,
List<int[]> vetoRegions)
Checks whether a possible match is being vetoed by a non split match. |
| Methods inherited from class gate.creole.AbstractLanguageAnalyser |
|---|
getCorpus, setCorpus |
| Methods inherited from class gate.creole.AbstractProcessingResource |
|---|
addProgressListener, addStatusListener, cleanup, fireProcessFinished, fireProgressChanged, fireStatusChanged, interrupt, isInterrupted, reInit, removeProgressListener, removeStatusListener |
| Methods inherited from class gate.creole.AbstractResource |
|---|
checkParameterValues, getBeanInfo, getName, getParameterValue, getParameterValue, removeResourceListeners, setName, setParameterValue, setParameterValue, setParameterValues, setParameterValues, setResourceListeners |
| Methods inherited from class gate.util.AbstractFeatureBearer |
|---|
getFeatures, setFeatures |
| Methods inherited from class java.lang.Object |
|---|
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
| Methods inherited from interface gate.ProcessingResource |
|---|
reInit |
| Methods inherited from interface gate.Resource |
|---|
cleanup, getParameterValue, setParameterValue, setParameterValues |
| Methods inherited from interface gate.util.FeatureBearer |
|---|
getFeatures, setFeatures |
| Methods inherited from interface gate.util.NameBearer |
|---|
getName, setName |
| Methods inherited from interface gate.Executable |
|---|
interrupt, isInterrupted |
| Field Detail |
|---|
public static final String SPLIT_DOCUMENT_PARAMETER_NAME
public static final String SPLIT_INPUT_AS_PARAMETER_NAME
public static final String SPLIT_OUTPUT_AS_PARAMETER_NAME
public static final String SPLIT_ENCODING_PARAMETER_NAME
public static final String SPLIT_SPLIT_LIST_PARAMETER_NAME
public static final String SPLIT_NON_SPLIT_LIST_PARAMETER_NAME
private static final long serialVersionUID
protected Document document
protected String outputASName
protected String encoding
protected URL internalSplitListURL
protected URL externalSplitListURL
protected URL nonSplitListURL
protected Pattern internalSplitsPattern
protected Pattern externalSplitsPattern
protected Pattern nonSplitsPattern
| Constructor Detail |
|---|
public RegexSentenceSplitter()
| Method Detail |
|---|
protected Pattern compilePattern(URL paternsListUrl,
String encoding)
throws UnsupportedEncodingException,
IOException
UnsupportedEncodingException
IOException
public void execute()
throws ExecutionException
AbstractProcessingResource
Specified by: execute in interface Executable
Overrides: execute in class AbstractProcessingResource
Throws: ExecutionException
private boolean veto(MatchResult split,
List<int[]> vetoRegions)
Parameters:
split - the match result representing the split to be tested
vetoRegions - regions where matches are not allowed. For efficiency
reasons, this method assumes these regions to be non overlapping and sorted
in ascending order.
All veto regions that end before the proposed match are also discarded
(again for efficiency reasons). This requires the proposed matches to be
sent to this method in ascending order, so as to avoid malfunctions.
public Resource init()
throws ResourceInstantiationException
AbstractProcessingResource
Specified by: init in interface Resource
Overrides: init in class AbstractProcessingResource
Throws: ResourceInstantiationException
public Document getDocument()
AbstractLanguageAnalyser
Specified by: getDocument in interface LanguageAnalyser
Overrides: getDocument in class AbstractLanguageAnalyser
public void setDocument(Document document)
AbstractLanguageAnalyser
Specified by: setDocument in interface LanguageAnalyser
Overrides: setDocument in class AbstractLanguageAnalyser
Parameters: document - the document to set
public String getOutputASName()
public void setOutputASName(String outputASName)
Parameters: outputASName - the outputASName to set
public String getEncoding()
public void setEncoding(String encoding)
Parameters: encoding - the encoding to set
public URL getInternalSplitListURL()
public void setInternalSplitListURL(URL internalSplitListURL)
Parameters: internalSplitListURL - the internalSplitListURL to set
public URL getExternalSplitListURL()
public void setExternalSplitListURL(URL externalSplitListURL)
Parameters: externalSplitListURL - the externalSplitListURL to set
public URL getNonSplitListURL()
public void setNonSplitListURL(URL nonSplitListURL)
Parameters: nonSplitListURL - the nonSplitListURL to set
public Pattern getInternalSplitsPattern()
public void setInternalSplitsPattern(Pattern internalSplitsPattern)
internalSplitsPattern - the internalSplitsPattern to set
|
||||||||||
| PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
| SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD | |||||||||