001    /***************************************************************************/
002    /*  Copyright (C) 2010-2011, Sebastian Hellmann                            */
003    /*  Note: If you need parts of NLP2RDF in another licence due to licence   */
004    /*  incompatibility, please mail hellmann@informatik.uni-leipzig.de        */
005    /*                                                                         */
006    /*  This file is part of NLP2RDF.                                          */
007    /*                                                                         */
008    /*  NLP2RDF is free software; you can redistribute it and/or modify        */
009    /*  it under the terms of the GNU General Public License as published by   */
010    /*  the Free Software Foundation; either version 3 of the License, or      */
011    /*  (at your option) any later version.                                    */
012    /*                                                                         */
013    /*  NLP2RDF is distributed in the hope that it will be useful,             */
014    /*  but WITHOUT ANY WARRANTY; without even the implied warranty of         */
015    /*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the           */
016    /*  GNU General Public License for more details.                           */
017    /*                                                                         */
018    /*  You should have received a copy of the GNU General Public License      */
019    /*  along with this program. If not, see <http://www.gnu.org/licenses/>.   */
020    /***************************************************************************/
021    
022    package org.nlp2rdf.core.impl;
023    
024    import com.hp.hpl.jena.ontology.OntModel;
025    import com.hp.hpl.jena.rdf.model.Model;
026    import com.jamonapi.Monitor;
027    import com.jamonapi.MonitorFactory;
028    import eu.lod2.nlp2rdf.schema.str.ContextHashBasedString;
029    import org.apache.commons.codec.digest.DigestUtils;
030    import org.nlp2rdf.core.Span;
031    import org.nlp2rdf.core.URIGenerator;
032    import org.nlp2rdf.core.util.URIGeneratorHelper;
033    import org.slf4j.Logger;
034    import org.slf4j.LoggerFactory;
035    
036    import java.security.InvalidParameterException;
037    import java.util.HashSet;
038    import java.util.Set;
039    import java.util.StringTokenizer;
040    
041    /**
042     * @author Sebastian Hellmann
043     *         <p/>
044     *         <p/>
045     *         This class implements the NIF Context-Hash URI Scheme.
046     *         http://nlp2rdf.org/nif-1-0#toc-nif-recipe-context-hash-based-uris
047     *         The initial contextLength is set to 10
048     *         <p/>
049     *         <p/>
050     *         To change this either call init(), which calculates the required minimal contextlength for all uris to be unique for this document.
051     *         or
052     *         use the constructor
053     *         <p/>
054     *         There is no reason, why this would be threaded, so it is not threadsafe
055     */
056    public class MD5Based extends AbstractURIGenerator implements URIGenerator {
057        private static Logger log = LoggerFactory.getLogger(MD5Based.class);
058        public static final String IDENTIFIER = "hash";
059        public static final String BRA = "(";
060        public static final String KET = ")";
061        protected int contextLength = 10;
062    
063        public MD5Based() {
064            this(10);
065        }
066    
067        public MD5Based(int contextLength) {
068            this.contextLength = contextLength;
069        }
070    
071        public MD5Based(String text, Set<Span> allSpans) {
072            setMinimalContextLength(text, allSpans);
073        }
074    
075        public MD5Based(String prefix, OntModel model) {
076            String delimiter = "_";
077            StringTokenizer st = new StringTokenizer(ContextHashBasedString.list(model).get(0).getURI().substring(prefix.length()), delimiter);
078            if (!(st.nextToken().equalsIgnoreCase(IDENTIFIER))) {
079                throw new InvalidParameterException("The span could not be recognized correctly: " + ContextHashBasedString.list(model).get(0) + " with prefix " + prefix);
080            }
081            contextLength = Integer.parseInt(st.nextToken());
082        }
083    
084        @Override
085        public String getRecipeUri() {
086            return "http://nlp2rdf.lod2.eu/schema/string/ContextHashBasedString";
087        }
088    
089        @Override
090        public void assignRecipeClass(String uri, OntModel model) {
091            ContextHashBasedString.create(uri, model);
092        }
093    
094        @Override
095        public String makeUri(String prefix, String text, Span span) {
096    
097            //the substring
098            String anchoredPart = span.getCoveredText(text).toString();
099    
100            StringBuilder message = new StringBuilder();
101            //calculate the context boundaries
102            message.append(URIGeneratorHelper.getContextBefore(span, text, contextLength));
103            message.append(BRA);
104            message.append(anchoredPart);
105            message.append(KET);
106            message.append(URIGeneratorHelper.getContextAfter(span, text, contextLength));
107    
108            String digest = DigestUtils.md5Hex(message.toString());
109            String firstChars = URIGeneratorHelper.getFirstCharacters(anchoredPart, firstCharLength);
110            StringBuilder uri = new StringBuilder();
111            uri.append(prefix);
112            uri.append(IDENTIFIER).append("_");
113            uri.append(contextLength).append("_");
114            uri.append(anchoredPart.length()).append("_");
115            uri.append(digest).append("_");
116            uri.append(firstChars);
117    
118            if (log.isTraceEnabled()) {
119                log.trace("Text (" + text.length() + " chars): " + text);
120                log.trace("Word (" + span.getCoveredText(text).length() + " chars): " + span.getCoveredText(text));
121                log.trace("Span: " + span.getStart() + "|" + span.getEnd());
122                //log.trace("Before|After: " + before + "|" + after);
123                log.trace("Context (" + contextLength + ") before: |" + URIGeneratorHelper.getContextBefore(span, text, contextLength));
124                log.trace("Context (" + contextLength + ") after: |" + URIGeneratorHelper.getContextAfter(span, text, contextLength) + "|");
125                log.trace("Message: |" + message.toString() + "|");
126                log.trace("URI: " + uri.toString());
127            }
128    
129            return uri.toString();
130        }
131    
132    
133        public void setMinimalContextLength(String text, Set<Span> spans) {
134            Monitor mon = MonitorFactory.getTimeMonitor(this.getClass().getSimpleName() + "init").start();
135            repeat(text, spans);
136            log.info("Minimal context calculated: " + contextLength + " needed: " + mon.stop().getLastValue() + " ms. ");
137        }
138    
139        private void repeat(String text, Set<Span> allSpans) {
140            Set<String> collision = new HashSet<String>();
141            for (Span span : allSpans) {
142                if (false == collision.add(makeUri("", text, span))) {
143                    contextLength++;
144                    repeat(text, allSpans);
145                    return;
146                }
147            }
148        }
149    
150    
151        @Override
152        public Span getSpanFor(String prefix, String uri, String text) {
153            String delimiter = "_";
154            StringTokenizer st = new StringTokenizer(uri.substring(prefix.length()), delimiter);
155            if (!(st.nextToken().equalsIgnoreCase(IDENTIFIER))) {
156                throw new InvalidParameterException("The span could not be recognized correctly: " + uri + " with prefix " + prefix);
157            }
158    
159            int contextLength = Integer.parseInt(st.nextToken());
160            int anchoredPartLength = Integer.parseInt(st.nextToken());
161            String digest = st.nextToken();
162    
163            StringBuilder humanReadablePart = new StringBuilder();
164            while (st.hasMoreTokens()) {
165                humanReadablePart.append(st.nextToken());
166                //test if the string might have "_" in the human readable part
167                if (st.hasMoreTokens()) {
168                    humanReadablePart.append(delimiter);
169                }
170    
171            }
172    
173            int offset = 0;
174            int index;
175            while ((index = text.indexOf(humanReadablePart.toString(), offset)) != -1) {
176                StringBuilder message = new StringBuilder();
177    
178                Span spanCandidate = new Span(index, index + anchoredPartLength);
179                //calculate the context boundaries
180                message.append(URIGeneratorHelper.getContextBefore(spanCandidate, text, contextLength));
181                message.append(BRA);
182                message.append(spanCandidate.getCoveredText(text));
183                message.append(KET);
184                message.append(URIGeneratorHelper.getContextAfter(spanCandidate, text, contextLength));
185    
186                String digestNew = DigestUtils.md5Hex(message.toString());
187                if (digest.equals(digestNew)) {
188                    return spanCandidate;
189                } else {
190                    //try the next one
191                    offset = index;
192                }
193            }
194            throw new RuntimeException("No matching string has been found in text");
195        }
196    
197        public int getContextLength() {
198            return contextLength;
199        }
200    
201        public void setContextLength(int contextLength) {
202            this.contextLength = contextLength;
203        }
204    
205    
206        /*
207       // the uri has been used in the same text already
208       if (false == collision.add(uri)) {
209        // if the context covers the whole text there is no sense in expanding anything
210        if (before == 0 && after == text.length()) {
211            log.warn("A non-unique String URI was discovered: " + uri + ". Anchored part was: " + anchoredPart + ". This normally only happens, because the code calling this object uses the same parameters for a second time.");
212            return uri;
213    
214        } else {
215            //make the context bigger, this will guarantee uniqueness
216            contextLength++;
217            throw new StartOverException("found a duplicate URI (hash collision), increasing context to: " + contextLength);
218        }
219       } */
220    
221    }