001 /***************************************************************************/
002 /* Copyright (C) 2010-2011, Sebastian Hellmann */
003 /* Note: If you need parts of NLP2RDF in another licence due to licence */
004 /* incompatibility, please mail hellmann@informatik.uni-leipzig.de */
005 /* */
006 /* This file is part of NLP2RDF. */
007 /* */
008 /* NLP2RDF is free software; you can redistribute it and/or modify */
009 /* it under the terms of the GNU General Public License as published by */
010 /* the Free Software Foundation; either version 3 of the License, or */
011 /* (at your option) any later version. */
012 /* */
013 /* NLP2RDF is distributed in the hope that it will be useful, */
014 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
015 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
016 /* GNU General Public License for more details. */
017 /* */
018 /* You should have received a copy of the GNU General Public License */
019 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
020 /***************************************************************************/
021
022 package org.nlp2rdf.core;
023
024 import com.jamonapi.Monitor;
025 import com.jamonapi.MonitorFactory;
026 import opennlp.tools.util.Span;
027 import org.slf4j.Logger;
028 import org.slf4j.LoggerFactory;
029
030 import java.util.ArrayList;
031 import java.util.List;
032
033 /**
034 * User: Sebastian Hellmann - http://bis.informatik.uni-leipzig.de/SebastianHellmann
035 */
036 public class POJOMaker {
037
038 private static Logger log = LoggerFactory.getLogger(POJOMaker.class);
039
040 public static List<SentencePOJO> makePOJOs(String text, Tokenizer tokenizer) {
041
042 List<SentencePOJO> ret = new ArrayList<SentencePOJO>();
043
044 Span[] sentences = tokenizer.detectSentences(text);
045 for (Span one : sentences) {
046 ret.add(makeOneSentencePOJO(text, one, tokenizer));
047 }
048 return ret;
049 }
050
051 protected static SentencePOJO makeOneSentencePOJO(String text, Span sentence, Tokenizer tokenizer) {
052
053 Monitor mon = MonitorFactory.getTimeMonitor("makeOneSentencePOJO").start();
054 //generate the sentence POJO
055 SentencePOJO sentencePOJO = new SentencePOJO();
056 sentencePOJO.setText(sentence.getCoveredText(text).toString());
057 sentencePOJO.setSpan(sentence);
058
059 List<WordPOJO> wordPOJOList = new ArrayList<WordPOJO>();
060 sentencePOJO.setWordPOJOs(wordPOJOList);
061
062
063 //the Tokenizer is not Thread-safe!
064 Span[] words = tokenizer.detectWords(sentencePOJO.getText());
065 for (Span one : words) {
066 WordPOJO wordPOJO = new WordPOJO();
067 wordPOJO.setText(one.getCoveredText(sentencePOJO.getText()).toString());
068 Span absolute = new Span(one, sentence.getStart());
069 wordPOJO.setSpan(absolute);
070
071 if (log.isTraceEnabled()) {
072 StringBuilder logging = new StringBuilder();
073 logging.append("\nword: " + one.getCoveredText(sentencePOJO.getText()));
074 logging.append("\nabsolute sentence position [start|end]: " + sentence.getStart() + "|" + sentence.getEnd());
075 logging.append("\nrelative word position in sentence [start|end]: " + one.getStart() + "|" + one.getEnd());
076 logging.append("\nabsolute word position [start|end]: " + absolute.getStart() + "|" + absolute.getEnd());
077 log.trace(logging.toString());
078 }
079
080 wordPOJOList.add(wordPOJO);
081 }
082 mon.stop();
083 log.debug("Finished creating POJOs of sentences (" + sentencePOJO.getWordPOJOs().size() + " words, " + mon.getLastValue() + " ms.): " + sentencePOJO.getText());
084 return sentencePOJO;
085 }
086
087 }
088