001 /*
002 * OUtils.java
003 *
004 * Copyright (c) 1995-2010, The University of Sheffield. See the file
005 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
006 *
007 * This file is part of GATE (see http://gate.ac.uk/), and is free
008 * software, licenced under the GNU Library General Public License,
009 * Version 2, June 1991 (in the distribution as file licence.html,
010 * and also available at http://gate.ac.uk/gate/licence.html).
011 *
012 * Ian Roberts 05/05/2010
013 *
014 * $Id: OUtils.java 12589 2010-05-05 15:49:41Z ian_roberts $
015 *
016 * This class includes code from the com.hp.hpl.jena.util.URIref class of jena
017 * (http://jena.sourceforge.net) which is subject to the following licence:
018 *
019 * (c) Copyright Hewlett-Packard Company 2001
020 * All rights reserved.
021 *
022 * Redistribution and use in source and binary forms, with or without
023 * modification, are permitted provided that the following conditions
024 * are met:
025 * 1. Redistributions of source code must retain the above copyright
026 * notice, this list of conditions and the following disclaimer.
027 * 2. Redistributions in binary form must reproduce the above copyright
028 * notice, this list of conditions and the following disclaimer in the
029 * documentation and/or other materials provided with the distribution.
030 * 3. The name of the author may not be used to endorse or promote products
031 * derived from this software without specific prior written permission.
032
033 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
034 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
035 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
036 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
037 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
038 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
039 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
040 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
041 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
042 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
043 */
044 package gate.creole.ontology;
045
046 import java.util.regex.Pattern;
047
/**
 * Static helper methods for building ontology resource names and for
 * %-encoding and decoding URI references. All methods are stateless; the
 * class cannot be instantiated.
 */
public final class OUtils {

  /**
   * Private constructor - this class should not be instantiated.
   */
  private OUtils() {
  }

  /**
   * Pattern for symbol and punctuation characters that are not recommended in
   * URIs, i.e. any punctuation or symbol character except
   * ; ? @ &amp; = + $ , _ . ! ~ * ( ) and the hyphen.
   */
  private static final Pattern BAD_PUNCT_PATTERN =
    Pattern.compile("[\\p{P}\\p{S}&&[^;\\?@&=\\+\\$,_\\.!~\\*\\(\\)\\-]]");

  /**
   * Pattern matching runs of whitespace.
   */
  private static final Pattern SPACES_PATTERN =
    Pattern.compile("\\s+");

  /**
   * Lookup table indexed by ASCII code: <code>true</code> for every byte
   * that {@link #uriEncode} copies to its output unchanged (ASCII letters,
   * digits, the RFC 2396 reserved and mark characters, plus '#', '[' and
   * ']'). The percent sign is deliberately absent as it is handled
   * separately.
   */
  private static final boolean[] URI_SAFE = new boolean[128];
  static {
    String safe = "abcdefghijklmnopqrstuvwxyz"
      + "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
      + "0123456789"
      + ";/?:@&=+$,"
      + "-_.!~*'()"
      + "#"
      + "[]";
    for (int i = 0; i < safe.length(); i++) {
      URI_SAFE[safe.charAt(i)] = true;
    }
  }

  /**
   * Converts a string to a form suitable for use as a resource name by the
   * {@link Ontology#createOURIForName} method. This is not a reversible
   * encoding, but is intended to produce "readable" resource URIs in an
   * ontology from English source strings. The process is:
   * <ol>
   * <li>Replace any slashes, colons, apostrophes and other non-URI-legal
   * punctuation characters and unicode symbol characters with a space, i.e.
   * any punctuation except ; ? @ &amp; = + $ , - _ . ! ~ * ( )</li>
   * <li>Convert any runs of whitespace characters to a single underscore</li>
   * <li>{@link #uriEncode} the result.</li>
   * </ol>
   * For example, this would convert "John Smith" to "John_Smith", "Allen &amp;
   * Heath" to "Allen_&amp;_Heath", "N/A" to "N_A", "32 &deg;F" to "32_F", etc.
   * Note that leading or trailing whitespace is not trimmed; it becomes a
   * leading or trailing underscore.
   *
   * @param text the source string, must not be <code>null</code>
   * @return the resource name derived from <code>text</code>
   */
  public static String toResourceName(String text) {
    String withoutBadPunct = BAD_PUNCT_PATTERN.matcher(text).replaceAll(" ");
    String underscored = SPACES_PATTERN.matcher(withoutBadPunct).replaceAll("_");
    return uriEncode(underscored);
  }

  /**
   * Convert a Unicode string (which is assumed to represent a URI or URI
   * fragment) to an RFC 2396-compliant URI reference by first converting it to
   * bytes in UTF-8 and then encoding the resulting bytes as specified by the
   * RFC. ASCII letters, numbers and the other characters that are permitted
   * in URI references are left unchanged, existing %NN escape sequences are
   * preserved (their hex digits are normalised to upper case), and any other
   * characters are %-escaped as appropriate. In particular any % characters
   * in the original string that are not part of a %NN escape sequence will
   * themselves be encoded as %25.
   *
   * @param uriRef The Unicode string to encode, must not be
   *         <code>null</code>
   * @return The encoded URI reference, consisting only of US-ASCII characters
   */
  public static String uriEncode(String uriRef) {
    try {
      byte[] utf8 = uriRef.getBytes("UTF-8");
      // Worst case: every input byte expands to a three byte %XX escape.
      byte[] ascii = new byte[utf8.length * 3];
      int in = 0;
      int out = 0;
      while (in < utf8.length) {
        byte b = utf8[in];
        if (b == '%'
            && in + 2 < utf8.length
            && isHexDigit(utf8[in + 1])
            && isHexDigit(utf8[in + 2])) {
          // Well-formed existing escape: keep it, upper-casing the digits.
          ascii[out++] = (byte)'%';
          ascii[out++] = hexEncode(hexDecode(utf8[in + 1]));
          ascii[out++] = hexEncode(hexDecode(utf8[in + 2]));
          in += 3;
        } else if (b >= 0 && URI_SAFE[b]) {
          // Character permitted in a URI reference: copy as is.
          ascii[out++] = b;
          in++;
        } else {
          // Anything else, including a bare '%' and every byte of a
          // multi-byte UTF-8 sequence, is escaped. Mask to drop the sign.
          int unsigned = b & 0xFF;
          ascii[out++] = (byte)'%';
          ascii[out++] = hexEncode(unsigned / 16);
          ascii[out++] = hexEncode(unsigned % 16);
          in++;
        }
      }
      return new String(ascii, 0, out, "US-ASCII");
    }
    catch (java.io.UnsupportedEncodingException e) {
      throw new Error(
        "The JVM is required to support UTF-8 and US-ASCII encodings.", e);
    }
  }

  /**
   * Convert a URI reference (URI or URI fragment) with escaped characters
   * taken from UTF-8 to the corresponding Unicode string. Every %NN escape,
   * including %25, is replaced by the byte it denotes and the resulting byte
   * sequence is interpreted as UTF-8. Characters in the input that are not
   * part of an escape sequence, including any non-ASCII characters, are
   * passed through unchanged. If the unescaped bytes are not valid UTF-8
   * the result is whatever the JVM's UTF-8 decoder produces for them.
   *
   * @param uri The uri, in characters specified by RFC 2396 + '#'.
   * @return The corresponding Unicode String.
   * @exception IllegalArgumentException If a % hex sequence is ill-formed,
   *         i.e. truncated or containing a non-hexadecimal character.
   */
  public static String uriDecode(String uri) {
    try {
      // Read the input as UTF-8 rather than US-ASCII so that stray non-ASCII
      // characters survive instead of being replaced by '?'. For pure ASCII
      // input the two encodings give identical bytes, and UTF-8 multi-byte
      // sequences never contain '%' or any other ASCII byte value.
      byte[] encoded = uri.getBytes("UTF-8");
      // Decoding never makes the data longer.
      byte[] decoded = new byte[encoded.length];
      int in = 0;
      int out = 0;
      while (in < encoded.length) {
        if (encoded[in] != '%') {
          decoded[out++] = encoded[in++];
          continue;
        }
        if (in + 1 >= encoded.length) {
          throw new IllegalArgumentException(
            "Incomplete Hex escape sequence in " + uri);
        }
        int high = hexDecode(encoded[in + 1]);
        if (in + 2 >= encoded.length) {
          throw new IllegalArgumentException(
            "Incomplete Hex escape sequence in " + uri);
        }
        int low = hexDecode(encoded[in + 2]);
        decoded[out++] = (byte)(high * 16 | low);
        in += 3;
      }
      return new String(decoded, 0, out, "UTF-8");
    }
    catch (java.io.UnsupportedEncodingException e) {
      throw new Error(
        "The JVM is required to support UTF-8 and US-ASCII encodings.", e);
    }
  }

  /**
   * Tests whether the given byte is an ASCII hexadecimal digit
   * (0-9, a-f or A-F).
   */
  private static boolean isHexDigit(byte b) {
    return (b >= '0' && b <= '9')
        || (b >= 'a' && b <= 'f')
        || (b >= 'A' && b <= 'F');
  }

  /**
   * Returns the upper case ASCII hexadecimal digit for a value in the range
   * 0 to 15.
   */
  private static byte hexEncode(int i) {
    if (i < 10) {
      return (byte)('0' + i);
    }
    return (byte)('A' + i - 10);
  }

  /**
   * Returns the numeric value (0 to 15) of an ASCII hexadecimal digit.
   *
   * @exception IllegalArgumentException if the byte is not one of 0-9, a-f
   *         or A-F.
   */
  private static int hexDecode(byte b) {
    if (b >= '0' && b <= '9') {
      return b - '0';
    }
    if (b >= 'a' && b <= 'f') {
      return b - 'a' + 10;
    }
    if (b >= 'A' && b <= 'F') {
      return b - 'A' + 10;
    }
    throw new IllegalArgumentException(
      "Bad Hex escape character: " + (((int)b) & 255));
  }
}
|