001 /***************************************************************************/
002 /* Copyright (C) 2010-2011, Sebastian Hellmann */
003 /* Note: If you need parts of NLP2RDF in another licence due to licence */
004 /* incompatibility, please mail hellmann@informatik.uni-leipzig.de */
005 /* */
006 /* This file is part of NLP2RDF. */
007 /* */
008 /* NLP2RDF is free software; you can redistribute it and/or modify */
009 /* it under the terms of the GNU General Public License as published by */
010 /* the Free Software Foundation; either version 3 of the License, or */
011 /* (at your option) any later version. */
012 /* */
013 /* NLP2RDF is distributed in the hope that it will be useful, */
014 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
015 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
016 /* GNU General Public License for more details. */
017 /* */
018 /* You should have received a copy of the GNU General Public License */
019 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
020 /***************************************************************************/
021
022 package org.nlp2rdf.core.impl;
023
024 import com.hp.hpl.jena.ontology.OntModel;
025 import com.jamonapi.Monitor;
026 import com.jamonapi.MonitorFactory;
027 import eu.lod2.nlp2rdf.schema.str.ContextHashBasedString;
028 import opennlp.tools.util.Span;
029 import org.apache.commons.codec.digest.DigestUtils;
030 import org.nlp2rdf.core.URIGenerator;
031 import org.nlp2rdf.core.util.URIGeneratorHelper;
032 import org.slf4j.Logger;
033 import org.slf4j.LoggerFactory;
034
035 import java.security.InvalidParameterException;
036 import java.util.HashSet;
037 import java.util.Set;
038 import java.util.StringTokenizer;
039
040 /**
041 * @author Sebastian Hellmann - http://bis.informatik.uni-leipzig.de/SebastianHellmann
042 * <p/>
043 * This class implements the NIF Context-hash URI recipe.
044 * Before creating URIs init should be called, because this sets the correct value for contextLength
045 * <p/>
046 * There is no reason, why this would be threaded, so it is not threadsafe
047 */
048 public class MD5Based extends AbstractURIGenerator implements URIGenerator {
049 private static Logger log = LoggerFactory.getLogger(MD5Based.class);
050 public static final String bra = "(";
051 public static final String ket = ")";
052
053 int firstCharLength = 20;
054 int contextLength = 0;
055 public static final String identifier = "hash";
056
057 @Override
058 public String getRecipeUri() {
059 return "http://nlp2rdf.lod2.eu/schema/string/ContextHashBasedString";
060 }
061
062 @Override
063 public void assignRecipeClass(String uri, OntModel model) {
064 ContextHashBasedString.create(uri, model);
065 }
066
067 @Override
068 public String makeUri(String prefix, String text, Span span) {
069
070 //the substring
071 String anchoredPart = span.getCoveredText(text).toString();
072
073 StringBuilder message = new StringBuilder();
074 //calculate the context boundaries
075 message.append(URIGeneratorHelper.getContextBefore(span, text, contextLength)).append(bra).append(anchoredPart).append(ket).append(URIGeneratorHelper.getContextAfter(span, text, contextLength));
076
077 String digest = DigestUtils.md5Hex(message.toString());
078 String firstChars = URIGeneratorHelper.getFirstCharacters(anchoredPart, firstCharLength);
079 StringBuilder uri = new StringBuilder();
080 uri.append(prefix);
081 uri.append(identifier).append("_");
082 uri.append(contextLength).append("_");
083 uri.append(anchoredPart.length()).append("_");
084 uri.append(digest).append("_");
085 uri.append(firstChars);
086
087 if (log.isTraceEnabled()) {
088 log.trace("Text (" + text.length() + " chars): " + text);
089 log.trace("Word (" + span.getCoveredText(text).length() + " chars): " + span.getCoveredText(text));
090 log.trace("Span: " + span.getStart() + "|" + span.getEnd());
091 //log.trace("Before|After: " + before + "|" + after);
092 log.trace("Context (" + contextLength + ") before: |" + URIGeneratorHelper.getContextBefore(span, text, contextLength));
093 log.trace("Context (" + contextLength + ") after: |" + URIGeneratorHelper.getContextAfter(span, text, contextLength) + "|");
094 log.trace("Message: |" + message.toString() + "|");
095 log.trace("URI: " + uri.toString());
096 }
097
098 return uri.toString();
099 }
100
101
102 @Override
103 public void init(String text, Set<Span> spans) {
104 Monitor mon = MonitorFactory.getTimeMonitor(this.getClass().getSimpleName() + "init").start();
105 repeat(text, spans);
106 log.info("Optimal context calculated: " + contextLength + " needed: " + mon.stop().getLastValue() + " ms. ");
107 }
108
109 private void repeat(String text, Set<Span> spans) {
110 Set<String> collision = new HashSet<String>();
111 for (Span span : spans) {
112 if (false == collision.add(makeUri("", text, span))) {
113 contextLength++;
114 repeat(text, spans);
115 return;
116 }
117 }
118 }
119
120
121 @Override
122 public Span getSpanFor(String prefix, String uri, String text, String anchoredPart) {
123 StringTokenizer st = new StringTokenizer(uri.substring(prefix.length()), "_");
124 if (!(st.nextToken().equalsIgnoreCase(identifier))) {
125 throw new InvalidParameterException("The span could not be recognized correctly: " + uri + " with prefix " + prefix);
126 }
127
128 int contextLength = Integer.parseInt(st.nextToken());
129 int anchoredPartLength = Integer.parseInt(st.nextToken());
130 String hash = st.nextToken();
131
132 if (true) {
133 throw new RuntimeException("getSpan is not yet implemented for hash based uris");
134 }
135 return new Span(0, 0);
136 }
137
138
139 /*
140 // the uri has been used in the same text already
141 if (false == collision.add(uri)) {
142 // if the context covers the whole text there is no sense in expanding anything
143 if (before == 0 && after == text.length()) {
144 log.warn("A non-unique String URI was discovered: " + uri + ". Anchored part was: " + anchoredPart + ". This normally only happens, because the code calling this object uses the same parameters for a second time.");
145 return uri;
146
147 } else {
148 //make the context bigger, this will guarantee uniqueness
149 contextLength++;
150 throw new StartOverException("found a duplicate URI (hash collision), increasing context to: " + contextLength);
151 }
152 } */
153
154 }