Menu

[r3166]: / trunk / Src / UURIEncode.pas  Maximize  Restore  History

Download this file

311 lines (269 with data), 11.4 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
{
* This Source Code Form is subject to the terms of the Mozilla Public License,
* v. 2.0. If a copy of the MPL was not distributed with this file, You can
* obtain one at http://mozilla.org/MPL/2.0/
*
* Copyright (C) 2010-2012, Peter Johnson (www.delphidabbler.com).
*
* $Rev$
* $Date$
*
* Routines that can encode and decode URIs according to RFC 3986.
}
unit UURIEncode;
interface
const
// Character sets and single characters used when URI encoding and decoding.
// Chars reserved for URIs: see RFC 3986 section 2.2
// generic: reserved by generic URI syntax
cURIGenReservedChars = [
':', '/', '?', '#', '[', ']', '@'
];
// sub-delimiters: may be reserved by different URI schemes
cURISubReservedChars = [
'!', '$', '&', '''', '(', ')', '*', '+', ',', ';', '='
];
// % character treated as reserved because it is used in percent encoding and
// must therefore be percent encoded if it is to be used literally in a URI:
// see RFC 3986 section 2.4
cPercent = '%';
// all the reserved chars: union of the two sets above plus the % character
cURIReservedChars =
cURIGenReservedChars + cURISubReservedChars + [cPercent];
// Unreserved URI chars: see RFC 3986 section 2.3. These are the only chars
// that URIEncode copies to its output without percent encoding them.
// NOTE: this name has a "cURL" prefix whereas the other sets here use "cURI".
cURLUnreservedChars = [
'A'..'Z', 'a'..'z', '0'..'9', '-', '_', '.', '~'
];
// Special reserved char used to encode spaces in query string encoding. It is
// also a member of cURISubReservedChars.
cPlus = '+';
function URIEncode(const S: UTF8String): string; overload;
{URI encodes a string. Every octet that is not an RFC 3986 unreserved
character is percent encoded.
@param S [in] String of UTF-8 characters to be encoded.
@return Encoded string. Contains only ASCII unreserved characters and "%".
}
function URIEncode(const S: UnicodeString): string; overload;
{URI encodes a string. The string is converted to UTF-8 before being encoded.
@param S [in] String of Unicode characters to be encoded.
@return Encoded string. Contains only ASCII unreserved characters and "%".
}
function URIEncode(const S: AnsiString): string; overload;
{URI encodes a string. The string is converted to UTF-8 before being encoded.
@param S [in] String of ANSI characters to be encoded.
@return Encoded string. Contains only ASCII unreserved characters and "%".
}
function URIEncodeQueryString(const S: UTF8String): string; overload;
{URI encodes a query string component. Spaces in original string are encoded
as "+".
@param S [in] String of UTF-8 characters to be encoded.
@return Encoded string. Contains only ASCII unreserved characters, "%" and
"+".
}
function URIEncodeQueryString(const S: UnicodeString): string; overload;
{URI encodes a query string component. Spaces in original string are encoded
as "+". The string is converted to UTF-8 before being encoded.
@param S [in] String of Unicode characters to be encoded.
@return Encoded string. Contains only ASCII unreserved characters, "%" and
"+".
}
function URIEncodeQueryString(const S: AnsiString): string; overload;
{URI encodes a query string component. Spaces in original string are encoded
as "+". The string is converted to UTF-8 before being encoded.
@param S [in] String of ANSI characters to be encoded.
@return Encoded string. Contains only ASCII unreserved characters, "%" and
"+".
}
function URIDecode(const Str: string): string;
{Decodes a URI encoded string. Each "%" followed by two hex digits is replaced
by the octet it represents and the resulting octets are interpreted as UTF-8.
@param Str [in] String to be decoded. *May* but should not contain
characters outside the unreserved character set per RFC 3986.
@return Decoded string.
@except EConvertError raised if Str contains malformed % escape sequences.
}
function URIDecodeQueryString(const Str: string): string;
{Decodes a URI encoded query string where spaces have been encoded as '+'.
@param Str [in] String to be decoded. *May* but should not contain
characters outside the unreserved character set per RFC 3986.
@return Decoded string.
@except EConvertError raised if Str contains malformed % escape sequences.
}
implementation
uses
// Delphi
SysUtils,
// Project
UStrUtils;
resourcestring
// Error messages
rsEscapeError = 'String to be decoded contains invalid % escape sequence';
const
// Percent encoding of space character
cPercentEncodedSpace = '%20';
{
Internally, URIDecode operates on UTF-8 strings for both input and output.
This lets us deal easily with any multi-byte characters in the input. There
SHOULDN'T be any such characters since all characters in a percent encoded URI
should map onto the unreserved characters from the ASCII character set.
However we need to allow for badly encoded URIs that *may* contain characters
outside this expected set.
UTF-8 also lets us perform an easy test for '%' characters in the input. Since
'%' can never occur as a UTF-8 lead or continuation byte we can simply test
for the actual character without worrying about whether it is part of a
multibyte character.
We use UTF-8 for the output string since UTF-8 should have been used to encode
the URI in the first place, therefore percent-encoded octets may map onto
UTF-8 lead or continuation bytes. Using any other string type would give
erroneous results.
We convert the interim UTF-8 result into the native string type before
returning.
}
function URIDecode(const Str: string): string;
  {Decodes a URI encoded string. Decoding is performed on a UTF-8 copy of the
  input: each "%XX" escape becomes the single octet with hex value XX and all
  other octets are copied as they are. The decoded octets are then converted
  from UTF-8 to the native string type.
    @param Str [in] String to be decoded. *May* but should not contain
      characters outside the unreserved character set per RFC 3986.
    @return Decoded string.
    @except EConvertError raised if Str contains malformed % escape sequences.
  }

  // ---------------------------------------------------------------------------
  function CountPercent(const S: UTF8String): Integer;
    {Counts number of '%' characters in a UTF8 string.
      @param S [in] String for which '%' characters to be counted.
      @return Number of '%' characters in S.
    }
  var
    Octet: AnsiChar;  // each octet of S in turn
  begin
    Result := 0;
    for Octet in S do
      if Octet = cPercent then
        Inc(Result);
  end;
  // ---------------------------------------------------------------------------

var
  Encoded: UTF8String;  // input string as UTF-8
  Decoded: UTF8String;  // output string as UTF-8
  EncodedLen: Integer;  // number of octets in Encoded
  ReadPos: Integer;     // position of next octet to read from Encoded
  WritePos: Integer;    // position of next octet to write in Decoded
  OctetValue: Integer;  // ordinal value of octet represented by an escape
begin
  Encoded := UTF8Encode(Str);
  EncodedLen := Length(Encoded);
  // Every well formed escape shrinks three octets down to one, so the result
  // needs two octets fewer per '%'. A malformed escape raises an exception
  // below, so this size is only ever relied upon for well formed input.
  SetLength(Decoded, EncodedLen - 2 * CountPercent(Encoded));
  ReadPos := 1;
  WritePos := 1;
  while ReadPos <= EncodedLen do
  begin
    if Encoded[ReadPos] <> cPercent then
    begin
      // ordinary octet (ASCII char or part of a multi-byte char): copy as is
      Decoded[WritePos] := Encoded[ReadPos];
      Inc(ReadPos);
    end
    else
    begin
      // escape sequence: must be followed by exactly two hex digits
      if ReadPos + 2 > EncodedLen then
        raise EConvertError.Create(rsEscapeError);  // malformed: too short
      if not TryStrToInt(
        '$' + string(Encoded[ReadPos + 1] + Encoded[ReadPos + 2]), OctetValue
      ) then
        raise EConvertError.Create(rsEscapeError);  // malformed: not valid hex
      Decoded[WritePos] := AnsiChar(OctetValue);
      Inc(ReadPos, 3);
    end;
    // exactly one octet is written on each pass, whichever branch was taken
    Inc(WritePos);
  end;
  // Hand back result in native string type
  Result := UTF8ToString(Decoded);
end;
function URIDecodeQueryString(const Str: string): string;
  {Decodes a URI encoded query string where spaces have been encoded as '+'.
    @param Str [in] String to be decoded. *May* but should not contain
      characters outside the unreserved character set per RFC 3986.
    @return Decoded string.
    @except EConvertError raised if Str contains malformed % escape sequences.
  }
var
  PercentEncoded: string; // Str with each '+' rewritten as "%20"
begin
  // The string is still URI encoded at this point and a space is not an
  // unreserved character, so each '+' is swapped for a percent-encoded space
  // rather than a literal one.
  PercentEncoded := StrReplace(Str, cPlus, cPercentEncodedSpace);
  // Now the whole string can be decoded as a normal percent-encoded string
  Result := URIDecode(PercentEncoded);
end;
{
Extract from RFC 3986 section 2.5:
"the data should first be encoded as octets according to the UTF-8 character
encoding [STD63]; then only those octets that do not correspond to
characters in the unreserved set should be percent-encoded."
This means we can simply scan a UTF-8 string and encode anything we find that
isn't in the unreserved set. We needn't worry about any lead or continuation
bytes in the UTF-8 encoding because all such bytes have a value of $80 or
above, while all unreserved characters are from a sub-set of ASCII and
therefore have an ordinal value of less than $80. So we needn't worry about
detecting lead and continuation bytes.
For details of the UTF-8 encoding see http://en.wikipedia.org/wiki/UTF-8
NOTE:
URIEncode should be applied to the component parts of the URI before they
are combined, not to the whole URI. See RFC 3986 section 2.4
}
function URIEncode(const S: UTF8String): string; overload;
  {URI encodes a string. Octets that are unreserved characters are copied to
  the result unchanged. Every other octet is written as "%" followed by its
  two digit hex value.
    @param S [in] String of UTF-8 characters to be encoded.
    @return Encoded string. Contains only ASCII unreserved characters and "%".
  }
var
  Idx: Integer;     // loops through all octets of S
  Octet: AnsiChar;  // octet of S currently being examined
begin
  Result := '';
  for Idx := 1 to Length(S) do
  begin
    Octet := S[Idx];
    if not CharInSet(Octet, cURLUnreservedChars) then
      // reserved char, or any octet of a multi-byte char: percent encode it
      Result := Result + '%' + IntToHex(Ord(Octet), 2)
    else
      // unreserved ASCII char: goes through as is
      Result := Result + WideChar(Octet);
  end;
end;
function URIEncode(const S: UnicodeString): string; overload;
  {URI encodes a string by converting it to UTF-8 and then URI encoding the
  UTF-8 octets.
    @param S [in] String of Unicode characters to be encoded.
    @return Encoded string. Contains only ASCII unreserved characters and "%".
  }
var
  UTF8Str: UTF8String;  // S converted to UTF-8
begin
  UTF8Str := UTF8Encode(S);
  Result := URIEncode(UTF8Str);
end;
function URIEncode(const S: AnsiString): string; overload;
  {URI encodes a string by converting it to UTF-8 and then URI encoding the
  UTF-8 octets.
    @param S [in] String of ANSI characters to be encoded.
    @return Encoded string. Contains only ASCII unreserved characters and "%".
  }
var
  UTF8Str: UTF8String;  // S converted to UTF-8
begin
  UTF8Str := UTF8Encode(S);
  Result := URIEncode(UTF8Str);
end;
function URIEncodeQueryString(const S: UTF8String): string; overload;
  {URI encodes a query string component. Spaces in original string are encoded
  as "+".
    @param S [in] String of UTF-8 characters to be encoded.
    @return Encoded string. Contains only ASCII unreserved characters, "%" and
      "+".
  }
var
  PercentEncoded: string; // S after normal URI encoding
begin
  // Do a normal URI encoding first. This percent encodes any '+' already in S,
  // so such characters can't be mistaken for the ones added below, and it
  // leaves every space encoded as %20.
  PercentEncoded := URIEncode(S);
  // Every %20 now stands for a space from S: swap each of them for '+'
  Result := StrReplace(PercentEncoded, cPercentEncodedSpace, cPlus);
end;
function URIEncodeQueryString(const S: UnicodeString): string; overload;
  {URI encodes a query string component. Spaces in original string are encoded
  as "+". The string is converted to UTF-8 before it is encoded.
    @param S [in] String of Unicode characters to be encoded.
    @return Encoded string. Contains only ASCII unreserved characters, "%" and
      "+".
  }
var
  UTF8Str: UTF8String;  // S converted to UTF-8
begin
  UTF8Str := UTF8Encode(S);
  Result := URIEncodeQueryString(UTF8Str);
end;
function URIEncodeQueryString(const S: AnsiString): string; overload;
  {URI encodes a query string component. Spaces in original string are encoded
  as "+". The string is converted to UTF-8 before it is encoded.
    @param S [in] String of ANSI characters to be encoded.
    @return Encoded string. Contains only ASCII unreserved characters, "%" and
      "+".
  }
var
  UTF8Str: UTF8String;  // S converted to UTF-8
begin
  UTF8Str := UTF8Encode(S);
  Result := URIEncodeQueryString(UTF8Str);
end;
end.
Want the latest updates on software, tech news, and AI?
Get latest updates about software, tech news, and AI from SourceForge directly in your inbox once a month.