001 /*
002 * APFormatExporter.java
003 *
004 * Copyright (c) 1995-2010, The University of Sheffield. See the file
005 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
006 *
007 * This file is part of GATE (see http://gate.ac.uk/), and is free
008 * software, licenced under the GNU Library General Public License,
009 * Version 2, June 1991 (in the distribution as file licence.html,
010 * and also available at http://gate.ac.uk/gate/licence.html).
011 *
012 * Cristian URSU, 26/Oct/2001
013 *
014 * $Id: APFormatExporter.java 12006 2009-12-01 17:24:28Z thomas_heitz $
015 */
016
017 package gate.creole;
018
019 import java.io.*;
020 import java.net.URL;
021 import java.util.*;
022
023 import gate.*;
024 import gate.util.*;
025
026 /** This class implements a APF xml exporter. It works on documents or corpora
027 * to export them in the APF format.
028 */
029 public class APFormatExporter extends AbstractLanguageAnalyser
030 implements ANNIEConstants{
031 public static final String
032 APF_EXP_DOCUMENT_PARAMETER_NAME = "document";
033
034 public static final String
035 APF_EXP_SOURCE_PARAMETER_NAME = "source";
036
037 public static final String
038 APF_EXP_DTD_PARAMETER_NAME = "dtdFileName";
039
040 public static final String
041 APF_EXP_PATH_PARAMETER_NAME = "exportFilePath";
042
043 public static final String
044 APF_EXP_TYPES_PARAMETER_NAME = "exportedTypes";
045
046 public static final String
047 APF_EXP_WRITE_SOURCE_PARAMETER_NAME = "isSourceWritten";
048
049 /** Debug flag */
050 private static final boolean DEBUG = false;
051 /** Constructor does nothing. This PR is bean like initialized*/
052 public APFormatExporter() {}
053
054 /** Run the resource and does the entire export process*/
055 public void execute() throws ExecutionException{
056 // Check if the thing can be run
057 if(document == null)
058 throw new ExecutionException("No document found to export in APF format!");
059 if (exportedTypes == null)
060 throw new ExecutionException("No export types found.");
061 xmlDoc = new StringBuffer(10*(document.getContent().size().intValue()));
062 initDocId();
063 if (docId == null)
064 throw new ExecutionException("Couldn't detect the document's ID");
065 if (DEBUG)
066 Out.prln("Document id = "+ docId);
067
068 String exportFilePathStr = null;
069 if (exportFilePath == null)
070 exportFilePathStr = new String(document.getSourceUrl().getFile() +
071 ".apf.xml");
072 else
073 exportFilePathStr = exportFilePath.getPath()+ "/"
074 + gate.util.Files.getLastPathComponent(
075 document.getSourceUrl().getFile()) + ".apf.xml";
076
077 if (DEBUG)
078 Out.prln("Export file path = "+ exportFilePathStr);
079 //*
080 // Prepare to write into the xmlFile
081 OutputStreamWriter writer = null;
082 try{
083 writer = new OutputStreamWriter(
084 new FileOutputStream(new File(exportFilePathStr)));
085
086 // Write (test the toXml() method)
087 // This Action is added only when a gate.Document is created.
088 // So, is Bor sure that the resource is a gate.Document
089 serializeDocumentToAPF();
090 writer.write(xmlDoc.toString());
091 writer.flush();
092 writer.close();
093 }catch (Exception e){
094 throw new ExecutionException(e);
095 }// End try
096 //*/
097 } // execute()
098
099
100 /** Initialise this resource, and returns it. */
101 public Resource init() throws ResourceInstantiationException {
102 return this;
103 } // init()
104
105 /** Java bean style mutator for exportedTypes */
106 public void setExportedTypes(List anExportedTypesList){
107 exportedTypes = anExportedTypesList;
108 }// setExportedTypes();
109
110 /** Java bean style accesor for exportedTypes */
111 public List getExportedTypes(){
112 return exportedTypes;
113 }// getExportedTypes()
114
115 /** Java bean style mutator for dtdFileName */
116 public void setDtdFileName(String aDtdFileName){
117 dtdFileName = aDtdFileName;
118 }// setDtdFileName();
119
120 /** Java bean style accesor for DtdFileName */
121 public String getDtdFileName(){
122 return dtdFileName;
123 }// getDtdFileName()
124
125 /** Java bean style mutator for exportFilePath */
126 public void setExportFilePath(URL anExportFilePath){
127 exportFilePath = anExportFilePath;
128 }// setExportFilePath();
129
130 /** Java bean style accesor for exportFilePath */
131 public URL getExportFilePath(){
132 return exportFilePath;
133 }// getDtdFileName()
134
135 /** Java bean style mutator for source */
136 public void setSource(String aSource){
137 source = aSource;
138 }// setSource();
139
140 /** Java bean style accesor for source */
141 public String getSource(){
142 return source;
143 }// getSource()
144
145 /** Java bean style accesor for isSourceWritten */
146 public Boolean getIsSourceWritten() {
147 return new Boolean(isSourceWritten);
148 }
149
150 /** Java bean style mutator for isSourceWritten */
151 public void setIsSourceWritten(Boolean aIsSourceWritten){
152 isSourceWritten = aIsSourceWritten.booleanValue();
153 }// setIsSourceWritten();
154
155
156
157 /** Initialises the docId with documents' file name without the complete path*/
158 private void initDocId(){
159 String fileName = "";
160 fileName = gate.util.Files.getLastPathComponent(
161 document.getSourceUrl().getFile());
162 // File name contains now the last token
163 if (DEBUG)
164 Out.prln("From initDocId, fileName ="+ fileName);
165 StringTokenizer fileNameTokenizer = new StringTokenizer(fileName,".");
166 StringBuffer tmpDocId = new StringBuffer("");
167 while(fileNameTokenizer.hasMoreTokens()){
168 String token = (String)fileNameTokenizer.nextToken();
169 // We don't want to append the last token
170 if (fileNameTokenizer.hasMoreTokens())
171 tmpDocId.append(token + ".");
172 }// End while
173 // if tokenization had place
174 if (!"".equals(tmpDocId)){
175 // Remove the last dot
176 tmpDocId.replace(tmpDocId.length()-1,tmpDocId.length(),"");
177 docId = tmpDocId.toString();
178 }// End if
179 }// initDocId()
180
181 /** Returns the xml document conforming to APF dtd.*/
182 protected void serializeDocumentToAPF(){
183 xmlDoc.append("<?xml version=\"1.0\" ?>\n");
184 xmlDoc.append("<!DOCTYPE source_file SYSTEM ");
185 if (dtdFileName == null)
186 xmlDoc.append("\"ace-rdc.v2.0.1.dtd\"");
187 else
188 xmlDoc.append("\""+dtdFileName+"\"");
189 xmlDoc.append(">\n");
190 xmlDoc.append("<source_file TYPE=\"text\"");
191 if (isSourceWritten) {
192 AnnotationSet docTypeAnns = document.getAnnotations(
193 GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME).get("DOCTYPE");
194 if (docTypeAnns == null || docTypeAnns.isEmpty())
195 xmlDoc.append(" SOURCE=\""+ source+ "\" ");
196 else {
197 Annotation docTypeAnn = (Annotation) docTypeAnns.iterator().next();
198 if (docTypeAnn.getFeatures().get("SOURCE") == null)
199 xmlDoc.append(" SOURCE=\""+ source+ "\" ");
200 else
201 xmlDoc.append(" SOURCE=\""+ docTypeAnn.getFeatures().get("SOURCE")+ "\" ");
202 }//if no doc type annotations
203 }
204 xmlDoc.append("VERSION=\"2.0\" URI=\"");
205 xmlDoc.append(docId);
206 xmlDoc.append("-lf\">\n");
207 xmlDoc.append(" <document DOCID=\"");
208 xmlDoc.append(docId + "\">\n");
209 serializeEntities();
210 xmlDoc.append(" </document>\n");
211 xmlDoc.append("</source_file>");
212 }// serializeDocumentToAPF()
213
214 /** Transforms all the entities from exportedTypes found in the GATE document
215 * into their xml representation
216 */
217 protected void serializeEntities(){
218 // If no types founded then simply return
219 if (exportedTypes == null || exportedTypes.isEmpty()) return;
220
221 Map entitiesMap = null;
222 if ( document.getFeatures() == null ||
223 document.getFeatures().get(DOCUMENT_COREF_FEATURE_NAME)== null)
224 entitiesMap = new HashMap();
225 else
226 entitiesMap = (Map)document.getFeatures().
227 get(DOCUMENT_COREF_FEATURE_NAME);
228 Map namedAnnotSetMap = null;
229 if (document.getNamedAnnotationSets() == null)
230 namedAnnotSetMap = new HashMap();
231 else
232 namedAnnotSetMap = new HashMap(document.getNamedAnnotationSets());
233 // Add the default annoattion set
234 namedAnnotSetMap.put(null,document.getAnnotations());
235 // The entities map is a map from annotation sets names to list of lists
236 // Each list element is composed from annotations refering the same entity
237 // All the entities that are in the exportedTypes need to be serialized.
238 Iterator exportedTypesIter = exportedTypes.iterator();
239 while(exportedTypesIter.hasNext()){
240 String entityType = (String)exportedTypesIter.next();
241 // Serialize all entities of type
242 // The keys in the entitesMap are annotation sets names. The null key
243 // designates the default annotation.
244 Set annotationSetNames = namedAnnotSetMap.keySet();
245 Iterator annotationSetNamesIter = annotationSetNames.iterator();
246 while (annotationSetNamesIter.hasNext()){
247 Object annotSetName = annotationSetNamesIter.next();
248 // This list contains entities found in the annotSetName
249 List entitiesList = (List) entitiesMap.get(annotSetName);
250 if (entitiesList == null) entitiesList = new ArrayList();
251 // This annotation set will contain all annotations of "entityType"
252 AnnotationSet annotSet = null;
253 Set serializationAnnotSet = null;
254 annotSet = (AnnotationSet)namedAnnotSetMap.get(annotSetName);
255 if (annotSet == null || annotSet.get(entityType) == null) continue;
256 serializationAnnotSet = new HashSet(annotSet.get(entityType));
257 // All annotations from annotSet will be serialized as entities unless
258 // some of them are present in the entities map
259 // Now we are searching for the entityType in the entitiesMap and
260 // serialize it from there. After that, remove all annotations
261 // entityType present in entitiesMap from annotSet and serialize the
262 // remaining entities.
263 //Iterate through the entitiesList in searching for entityType
264 Iterator entitiesListIter = entitiesList.iterator();
265 while (entitiesListIter.hasNext()){
266 List entity = (List)entitiesListIter.next();
267 // We want now to accesate an annotation from the entity list to get
268 // its type and compare it with entityType
269 String theEntityType = new String("");
270 if (entity != null && !entity.isEmpty()){
271 Integer annotId = (Integer)entity.get(0);
272 Annotation a = (Annotation)annotSet.get(annotId);
273 if (a != null) theEntityType = a.getType();
274 }// End if
275 // The the types are equal then serialize the entities
276 if (theEntityType.equals(entityType)){
277 List ent = new ArrayList();
278 Iterator entityIter = entity.iterator();
279 while(entityIter.hasNext()){
280 Integer id = (Integer)entityIter.next();
281 ent.add(annotSet.get(id));
282 }// End while
283 serializeAnEntity(ent);
284 // Remove all annotation from entity that apear in annotSet
285 serializationAnnotSet.removeAll(ent);
286 }// End if
287 }// End while(entitiesListIter.hasNext())
288 // Serialize the remaining entities in annotSet
289 Iterator serializationAnnotSetIter = serializationAnnotSet.iterator();
290 while(serializationAnnotSetIter.hasNext()){
291 Annotation annotEntity = (Annotation) serializationAnnotSetIter.next();
292 List ent = new ArrayList();
293 ent.add(annotEntity);
294 serializeAnEntity(ent);
295 }// End while(annotSetIter.hasNext())
296 }// End while(entitiesKeysIter.hasNext())
297 }// End while(exportedTypesIter.hasNext())
298 }// serializeEntities()
299
300 /** Writes an entity in the xmlDoc conforming to APF standards.
301 * @param anEntity represents a list with annotations that refer the same
302 * entity. Those annotations were detected and constructed by the
303 * orthomatcher.
304 */
305 private void serializeAnEntity(List anEntity){
306 if (anEntity == null || anEntity.isEmpty()) return;
307 // Write the entities tags
308 xmlDoc.append(" <entity ID=\"" + docId + "-" + getNextEntityId() + "\">\n");
309 // We know for sure that the list is not empty (see above)
310 Annotation a = (Annotation) anEntity.get(0);
311 xmlDoc.append(" <entity_type GENERIC=\"FALSE\">" + a.getType().toUpperCase() +
312 "</entity_type>\n");
313 // Write the entities mentions
314 Iterator anEntityIter = anEntity.iterator();
315 while(anEntityIter.hasNext()){
316 Annotation ann = (Annotation)anEntityIter.next();
317 serializeAnEntityMention(ann);
318 }// End while(anEntityIter.hasNext())
319 // Write the entities attributes
320 xmlDoc.append(" <entity_attributes>\n");
321 anEntityIter = anEntity.iterator();
322 while(anEntityIter.hasNext()){
323 Annotation ann = (Annotation)anEntityIter.next();
324 serializeAnEntityAttributes(ann);
325 }// End while(anEntityIter.hasNext())
326 xmlDoc.append(" </entity_attributes>\n");
327 xmlDoc.append(" </entity>\n");
328 }// End serializeAnEntity();
329
330 /** This method serializes an entity mention from an Annotation*/
331 private void serializeAnEntityMention(Annotation ann){
332 if (ann == null) return;
333 String entityMentionType = "NAME";
334 String entityMentionRole = null;
335 String entityMentionReference = null;
336 String entityMentionGeneric = null;
337
338 FeatureMap fm = ann.getFeatures();
339 if (fm != null){
340 if( null != fm.get("ENTITY_MENTION_TYPE"))
341 entityMentionType = (String) fm.get("ENTITY_MENTION_TYPE");
342
343 entityMentionRole = (String) fm.get("ROLE");
344 entityMentionReference = (String) fm.get("REFERENCE");
345 entityMentionGeneric = (String) fm.get("GENERIC");
346 }// End if
347 String str1 = (entityMentionRole == null)? "" :
348 ("ROLE=\"" + entityMentionRole + "\"");
349 String str2 = (entityMentionReference == null)? "" :
350 ("REFERENCE=\"" + entityMentionReference + "\"");
351 String str3 = (entityMentionGeneric == null)? "" :
352 ("GENERIC=\"" + entityMentionGeneric + "\"");
353
354 /* modified by Di - the new scorer needs a unique ID for each mention as well */
355
356 xmlDoc.append(" <entity_mention TYPE=\"" + entityMentionType+"\"" +
357 str1 + " " + str2 + " " + str3 + "ID=\"" + "M" + getNextMentionId() +"\">\n"
358 );
359
360 // extent
361 xmlDoc.append(" <extent>\n");
362 xmlDoc.append(" <charseq>\n");
363 try{
364 xmlDoc.append(" <!-- string = \"" +
365 document.getContent().getContent(ann.getStartNode().getOffset(),
366 ann.getEndNode().getOffset())+"\" -->\n");
367 }catch (InvalidOffsetException ioe){
368 Err.prln("APFormatExporter:Warning: Couldn't access text between"+
369 " offsets:" + ann.getStartNode().getOffset() + " and "+
370 ann.getEndNode().getOffset());
371 }// End try
372 xmlDoc.append(" <start>"+ann.getStartNode().getOffset()+
373 "</start><end>"+(ann.getEndNode().getOffset().longValue() - 1)+"</end>\n");
374 xmlDoc.append(" </charseq>\n");
375 xmlDoc.append(" </extent>\n");
376 // head
377 xmlDoc.append(" <head>\n");
378 xmlDoc.append(" <charseq>\n");
379 try{
380 xmlDoc.append(" <!-- string = \"" +
381 document.getContent().getContent(ann.getStartNode().getOffset(),
382 ann.getEndNode().getOffset())+"\" -->\n");
383 }catch (InvalidOffsetException ioe){
384 Err.prln("APFormatExporter:Warning: Couldn't access text between"+
385 " offsets:" + ann.getStartNode().getOffset() + " and "+
386 ann.getEndNode().getOffset());
387 }// End try
388 xmlDoc.append(" <start>"+ann.getStartNode().getOffset()+
389 "</start><end>"+(ann.getEndNode().getOffset().longValue() - 1)+"</end>\n");
390 xmlDoc.append(" </charseq>\n");
391 xmlDoc.append(" </head>\n");
392 xmlDoc.append(" </entity_mention>\n");
393 }//serializeAnEntityMention();
394
395 /** This method serializes an entity attribute from an Annotation*/
396 private void serializeAnEntityAttributes(Annotation ann){
397 if (ann == null) return;
398 boolean isAttribute = false;
399 if ("NAME".equals(ann.getFeatures().get("ENTITY_MENTION_TYPE"))
400 ||
401 null == ann.getFeatures().get("ENTITY_MENTION_TYPE"))
402 isAttribute = true;
403 if (! isAttribute)
404 return;
405
406 // name
407 xmlDoc.append(" <name>\n");
408 xmlDoc.append(" <charseq>\n");
409 try{
410 xmlDoc.append(" <!-- string = \"" +
411 document.getContent().getContent(ann.getStartNode().getOffset(),
412 ann.getEndNode().getOffset())+"\" -->\n");
413 }catch (InvalidOffsetException ioe){
414 Err.prln("APFormatExporter:Warning: Couldn't access text between"+
415 " offsets:" + ann.getStartNode().getOffset() + " and "+
416 ann.getEndNode().getOffset());
417 }// End try
418 xmlDoc.append(" <start>"+ann.getStartNode().getOffset()+
419 "</start><end>"+(ann.getEndNode().getOffset().longValue() - 1)+"</end>\n");
420 xmlDoc.append(" </charseq>\n");
421 xmlDoc.append(" </name>\n");
422 }//serializeAnEntityMention();
423
424 /** Returns the next safe ID for an entity*/
425 private int getNextEntityId(){
426 return entityId ++;
427 }// getNextEntityId()
428
429 /** added by Di - returns the next safe ID for an entity mention */
430 private int getNextMentionId(){
431 return mentionId ++;
432 }
433
434
435 /** This list of strings represents the entities type that will be exported*/
436 private List exportedTypes = null;
437 /** This is the name of the dtd file. If it's not present no dtd would be
438 * written in the APF file.
439 */
440 private String dtdFileName = null;
441 /** This field represent the document id and it is used in generating the
442 * entities IDs. It is the file name of the document, without the extension
443 */
444 private String docId = null;
445
446 /** This field represent an unique entity ID generator*/
447 private int entityId = 1;
448
449 /** added by Di - this field represents a unique entity ID generator */
450 private int mentionId = 1;
451
452 /** This is the xmlDoc that will be created*/
453 private StringBuffer xmlDoc = null;
454
455 private URL exportFilePath = null;
456
457 /** The source attribute for source*/
458 private String source = null;
459
460 /** The source attribute for source*/
461 private boolean isSourceWritten = true;
462
463
464 }// APFormatExporter
|