001 /***************************************************************************/
002 /* Copyright (C) 2010-2011, Sebastian Hellmann */
003 /* Note: If you need parts of NLP2RDF in another licence due to licence */
004 /* incompatibility, please mail hellmann@informatik.uni-leipzig.de */
005 /* */
006 /* This file is part of NLP2RDF. */
007 /* */
008 /* NLP2RDF is free software; you can redistribute it and/or modify */
009 /* it under the terms of the GNU General Public License as published by */
010 /* the Free Software Foundation; either version 3 of the License, or */
011 /* (at your option) any later version. */
012 /* */
013 /* NLP2RDF is distributed in the hope that it will be useful, */
014 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
015 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
016 /* GNU General Public License for more details. */
017 /* */
018 /* You should have received a copy of the GNU General Public License */
019 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
020 /***************************************************************************/
021
022 package org.nlp2rdf.core.impl;
023
024 import com.hp.hpl.jena.ontology.OntModel;
025 import com.hp.hpl.jena.rdf.model.Model;
026 import com.jamonapi.Monitor;
027 import com.jamonapi.MonitorFactory;
028 import eu.lod2.nlp2rdf.schema.str.ContextHashBasedString;
029 import org.apache.commons.codec.digest.DigestUtils;
030 import org.nlp2rdf.core.Span;
031 import org.nlp2rdf.core.URIGenerator;
032 import org.nlp2rdf.core.util.URIGeneratorHelper;
033 import org.slf4j.Logger;
034 import org.slf4j.LoggerFactory;
035
036 import java.security.InvalidParameterException;
037 import java.util.HashSet;
038 import java.util.Set;
039 import java.util.StringTokenizer;
040
041 /**
042 * @author Sebastian Hellmann
043 * <p/>
044 * <p/>
045 * This class implements the NIF Context-Hash URI Scheme.
046 * http://nlp2rdf.org/nif-1-0#toc-nif-recipe-context-hash-based-uris
047 * The initial contextLength is set to 10
048 * <p/>
049 * <p/>
050 * To change this either call init(), which calculates the required minimal contextlength for all uris to be unique for this document.
051 * or
052 * use the constructor
053 * <p/>
054 * There is no reason, why this would be threaded, so it is not threadsafe
055 */
056 public class MD5Based extends AbstractURIGenerator implements URIGenerator {
057 private static Logger log = LoggerFactory.getLogger(MD5Based.class);
058 public static final String IDENTIFIER = "hash";
059 public static final String BRA = "(";
060 public static final String KET = ")";
061 protected int contextLength = 10;
062
063 public MD5Based() {
064 this(10);
065 }
066
067 public MD5Based(int contextLength) {
068 this.contextLength = contextLength;
069 }
070
071 public MD5Based(String text, Set<Span> allSpans) {
072 setMinimalContextLength(text, allSpans);
073 }
074
075 public MD5Based(String prefix, OntModel model) {
076 String delimiter = "_";
077 StringTokenizer st = new StringTokenizer(ContextHashBasedString.list(model).get(0).getURI().substring(prefix.length()), delimiter);
078 if (!(st.nextToken().equalsIgnoreCase(IDENTIFIER))) {
079 throw new InvalidParameterException("The span could not be recognized correctly: " + ContextHashBasedString.list(model).get(0) + " with prefix " + prefix);
080 }
081 contextLength = Integer.parseInt(st.nextToken());
082 }
083
084 @Override
085 public String getRecipeUri() {
086 return "http://nlp2rdf.lod2.eu/schema/string/ContextHashBasedString";
087 }
088
089 @Override
090 public void assignRecipeClass(String uri, OntModel model) {
091 ContextHashBasedString.create(uri, model);
092 }
093
094 @Override
095 public String makeUri(String prefix, String text, Span span) {
096
097 //the substring
098 String anchoredPart = span.getCoveredText(text).toString();
099
100 StringBuilder message = new StringBuilder();
101 //calculate the context boundaries
102 message.append(URIGeneratorHelper.getContextBefore(span, text, contextLength));
103 message.append(BRA);
104 message.append(anchoredPart);
105 message.append(KET);
106 message.append(URIGeneratorHelper.getContextAfter(span, text, contextLength));
107
108 String digest = DigestUtils.md5Hex(message.toString());
109 String firstChars = URIGeneratorHelper.getFirstCharacters(anchoredPart, firstCharLength);
110 StringBuilder uri = new StringBuilder();
111 uri.append(prefix);
112 uri.append(IDENTIFIER).append("_");
113 uri.append(contextLength).append("_");
114 uri.append(anchoredPart.length()).append("_");
115 uri.append(digest).append("_");
116 uri.append(firstChars);
117
118 if (log.isTraceEnabled()) {
119 log.trace("Text (" + text.length() + " chars): " + text);
120 log.trace("Word (" + span.getCoveredText(text).length() + " chars): " + span.getCoveredText(text));
121 log.trace("Span: " + span.getStart() + "|" + span.getEnd());
122 //log.trace("Before|After: " + before + "|" + after);
123 log.trace("Context (" + contextLength + ") before: |" + URIGeneratorHelper.getContextBefore(span, text, contextLength));
124 log.trace("Context (" + contextLength + ") after: |" + URIGeneratorHelper.getContextAfter(span, text, contextLength) + "|");
125 log.trace("Message: |" + message.toString() + "|");
126 log.trace("URI: " + uri.toString());
127 }
128
129 return uri.toString();
130 }
131
132
133 public void setMinimalContextLength(String text, Set<Span> spans) {
134 Monitor mon = MonitorFactory.getTimeMonitor(this.getClass().getSimpleName() + "init").start();
135 repeat(text, spans);
136 log.info("Minimal context calculated: " + contextLength + " needed: " + mon.stop().getLastValue() + " ms. ");
137 }
138
139 private void repeat(String text, Set<Span> allSpans) {
140 Set<String> collision = new HashSet<String>();
141 for (Span span : allSpans) {
142 if (false == collision.add(makeUri("", text, span))) {
143 contextLength++;
144 repeat(text, allSpans);
145 return;
146 }
147 }
148 }
149
150
151 @Override
152 public Span getSpanFor(String prefix, String uri, String text) {
153 String delimiter = "_";
154 StringTokenizer st = new StringTokenizer(uri.substring(prefix.length()), delimiter);
155 if (!(st.nextToken().equalsIgnoreCase(IDENTIFIER))) {
156 throw new InvalidParameterException("The span could not be recognized correctly: " + uri + " with prefix " + prefix);
157 }
158
159 int contextLength = Integer.parseInt(st.nextToken());
160 int anchoredPartLength = Integer.parseInt(st.nextToken());
161 String digest = st.nextToken();
162
163 StringBuilder humanReadablePart = new StringBuilder();
164 while (st.hasMoreTokens()) {
165 humanReadablePart.append(st.nextToken());
166 //test if the string might have "_" in the human readable part
167 if (st.hasMoreTokens()) {
168 humanReadablePart.append(delimiter);
169 }
170
171 }
172
173 int offset = 0;
174 int index;
175 while ((index = text.indexOf(humanReadablePart.toString(), offset)) != -1) {
176 StringBuilder message = new StringBuilder();
177
178 Span spanCandidate = new Span(index, index + anchoredPartLength);
179 //calculate the context boundaries
180 message.append(URIGeneratorHelper.getContextBefore(spanCandidate, text, contextLength));
181 message.append(BRA);
182 message.append(spanCandidate.getCoveredText(text));
183 message.append(KET);
184 message.append(URIGeneratorHelper.getContextAfter(spanCandidate, text, contextLength));
185
186 String digestNew = DigestUtils.md5Hex(message.toString());
187 if (digest.equals(digestNew)) {
188 return spanCandidate;
189 } else {
190 //try the next one
191 offset = index;
192 }
193 }
194 throw new RuntimeException("No matching string has been found in text");
195 }
196
197 public int getContextLength() {
198 return contextLength;
199 }
200
201 public void setContextLength(int contextLength) {
202 this.contextLength = contextLength;
203 }
204
205
206 /*
207 // the uri has been used in the same text already
208 if (false == collision.add(uri)) {
209 // if the context covers the whole text there is no sense in expanding anything
210 if (before == 0 && after == text.length()) {
211 log.warn("A non-unique String URI was discovered: " + uri + ". Anchored part was: " + anchoredPart + ". This normally only happens, because the code calling this object uses the same parameters for a second time.");
212 return uri;
213
214 } else {
215 //make the context bigger, this will guarantee uniqueness
216 contextLength++;
217 throw new StartOverException("found a duplicate URI (hash collision), increasing context to: " + contextLength);
218 }
219 } */
220
221 }