001    /***************************************************************************/
002    /*  Copyright (C) 2010-2011, Sebastian Hellmann                            */
003    /*  Note: If you need parts of NLP2RDF in another licence due to licence   */
004    /*  incompatibility, please mail hellmann@informatik.uni-leipzig.de        */
005    /*                                                                         */
006    /*  This file is part of NLP2RDF.                                          */
007    /*                                                                         */
008    /*  NLP2RDF is free software; you can redistribute it and/or modify        */
009    /*  it under the terms of the GNU General Public License as published by   */
010    /*  the Free Software Foundation; either version 3 of the License, or      */
011    /*  (at your option) any later version.                                    */
012    /*                                                                         */
013    /*  NLP2RDF is distributed in the hope that it will be useful,             */
014    /*  but WITHOUT ANY WARRANTY; without even the implied warranty of         */
015    /*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the           */
016    /*  GNU General Public License for more details.                           */
017    /*                                                                         */
018    /*  You should have received a copy of the GNU General Public License      */
019    /*  along with this program. If not, see <http://www.gnu.org/licenses/>.   */
020    /***************************************************************************/
021    
022    package org.nlp2rdf.core.impl;
023    
024    import com.hp.hpl.jena.ontology.OntModel;
025    import com.jamonapi.Monitor;
026    import com.jamonapi.MonitorFactory;
027    import eu.lod2.nlp2rdf.schema.str.ContextHashBasedString;
028    import opennlp.tools.util.Span;
029    import org.apache.commons.codec.digest.DigestUtils;
030    import org.nlp2rdf.core.URIGenerator;
031    import org.nlp2rdf.core.util.URIGeneratorHelper;
032    import org.slf4j.Logger;
033    import org.slf4j.LoggerFactory;
034    
035    import java.security.InvalidParameterException;
036    import java.util.HashSet;
037    import java.util.Set;
038    import java.util.StringTokenizer;
039    
040    /**
041     * @author Sebastian Hellmann - http://bis.informatik.uni-leipzig.de/SebastianHellmann
042     *         <p/>
043     *         This class implements the NIF Context-hash URI recipe.
044     *         Before creating URIs init should be called, because this sets the correct value for contextLength
045     *         <p/>
046     *         There is no reason, why this would be threaded, so it is not threadsafe
047     */
048    public class MD5Based extends AbstractURIGenerator implements URIGenerator {
049        private static Logger log = LoggerFactory.getLogger(MD5Based.class);
050        public static final String bra = "(";
051        public static final String ket = ")";
052    
053        int firstCharLength = 20;
054        int contextLength = 0;
055        public static final String identifier = "hash";
056    
057        @Override
058        public String getRecipeUri() {
059            return "http://nlp2rdf.lod2.eu/schema/string/ContextHashBasedString";
060        }
061    
062        @Override
063        public void assignRecipeClass(String uri, OntModel model) {
064            ContextHashBasedString.create(uri, model);
065        }
066    
067        @Override
068        public String makeUri(String prefix, String text, Span span) {
069    
070            //the substring
071            String anchoredPart = span.getCoveredText(text).toString();
072    
073            StringBuilder message = new StringBuilder();
074            //calculate the context boundaries
075            message.append(URIGeneratorHelper.getContextBefore(span, text, contextLength)).append(bra).append(anchoredPart).append(ket).append(URIGeneratorHelper.getContextAfter(span, text, contextLength));
076    
077            String digest = DigestUtils.md5Hex(message.toString());
078            String firstChars = URIGeneratorHelper.getFirstCharacters(anchoredPart, firstCharLength);
079            StringBuilder uri = new StringBuilder();
080            uri.append(prefix);
081            uri.append(identifier).append("_");
082            uri.append(contextLength).append("_");
083            uri.append(anchoredPart.length()).append("_");
084            uri.append(digest).append("_");
085            uri.append(firstChars);
086    
087            if (log.isTraceEnabled()) {
088                log.trace("Text (" + text.length() + " chars): " + text);
089                log.trace("Word (" + span.getCoveredText(text).length() + " chars): " + span.getCoveredText(text));
090                log.trace("Span: " + span.getStart() + "|" + span.getEnd());
091                //log.trace("Before|After: " + before + "|" + after);
092                log.trace("Context (" + contextLength + ") before: |" + URIGeneratorHelper.getContextBefore(span, text, contextLength));
093                log.trace("Context (" + contextLength + ") after: |" + URIGeneratorHelper.getContextAfter(span, text, contextLength) + "|");
094                log.trace("Message: |" + message.toString() + "|");
095                log.trace("URI: " + uri.toString());
096            }
097    
098            return uri.toString();
099        }
100    
101    
102        @Override
103        public void init(String text, Set<Span> spans) {
104            Monitor mon = MonitorFactory.getTimeMonitor(this.getClass().getSimpleName() + "init").start();
105            repeat(text, spans);
106            log.info("Optimal context calculated: " + contextLength + " needed: " + mon.stop().getLastValue() + " ms. ");
107        }
108    
109        private void repeat(String text, Set<Span> spans) {
110            Set<String> collision = new HashSet<String>();
111            for (Span span : spans) {
112                if (false == collision.add(makeUri("", text, span))) {
113                    contextLength++;
114                    repeat(text, spans);
115                    return;
116                }
117            }
118        }
119    
120    
121        @Override
122        public Span getSpanFor(String prefix, String uri, String text, String anchoredPart) {
123            StringTokenizer st = new StringTokenizer(uri.substring(prefix.length()), "_");
124            if (!(st.nextToken().equalsIgnoreCase(identifier))) {
125                throw new InvalidParameterException("The span could not be recognized correctly: " + uri + " with prefix " + prefix);
126            }
127    
128            int contextLength = Integer.parseInt(st.nextToken());
129            int anchoredPartLength = Integer.parseInt(st.nextToken());
130            String hash = st.nextToken();
131    
132            if (true) {
133                throw new RuntimeException("getSpan is not yet implemented for hash based uris");
134            }
135            return new Span(0, 0);
136        }
137    
138    
139    /*
140    // the uri has been used in the same text already
141    if (false == collision.add(uri)) {
142      // if the context covers the whole text there is no sense in expanding anything
143      if (before == 0 && after == text.length()) {
144          log.warn("A non-unique String URI was discovered: " + uri + ". Anchored part was: " + anchoredPart + ". This normally only happens, because the code calling this object uses the same parameters for a second time.");
145          return uri;
146    
147      } else {
148          //make the context bigger, this will guarantee uniqueness
149          contextLength++;
150          throw new StartOverException("found a duplicate URI (hash collision), increasing context to: " + contextLength);
151      }
152    } */
153    
154    }