Menu

[r725]: / branches / revised-webservices / Src / UURIEncode.pas  Maximize  Restore  History

Download this file

245 lines (208 with data), 8.1 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
{
* UURIEncode.pas
*
* Routines that can encode and decode URIs according to RFC 3986.
*
* $Rev$
* $Date$
*
* ***** BEGIN LICENSE BLOCK *****
*
* Version: MPL 1.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with the
* License. You may obtain a copy of the License at http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
* the specific language governing rights and limitations under the License.
*
* The Original Code is UURIEncode.pas
*
* The Initial Developer of the Original Code is Peter Johnson
* (http://www.delphidabbler.com/).
*
* Portions created by the Initial Developer are Copyright (C) 2010 Peter
* Johnson. All Rights Reserved.
*
* Contributor(s)
* NONE
*
* ***** END LICENSE BLOCK *****
}
unit UURIEncode;
{$WARN UNSAFE_CODE OFF}
{$WARN EXPLICIT_STRING_CAST OFF}
{.$WARN EXPLICIT_STRING_CAST_LOSS OFF}
interface
const
// Character sets used when percent-encoding URIs, as defined by RFC 3986.
// Chars reserved for URIs: see RFC 3986 section 2.2
// generic: delimiters reserved by the generic URI syntax ("gen-delims")
cURIGenReservedChars = [
':', '/', '?', '#', '[', ']', '@'
];
// sub-delimiters: may be reserved by individual URI schemes ("sub-delims")
cURISubReservedChars = [
'!', '$', '&', '''', '(', ')', '*', '+', ',', ';', '='
];
// % character is treated as reserved because it introduces a percent-encoded
// octet and must therefore itself be percent-encoded if it is to be used
// literally in a URI: see RFC 3986 section 2.4
cPercent = '%';
// all the reserved chars: union of the two sets above plus the % character
cURIReservedChars =
cURIGenReservedChars + cURISubReservedChars + [cPercent];
// Unreserved URI chars: see RFC 3986 section 2.3. These are the only chars
// that the encoding routines in this unit leave un-encoded. (Note that the
// identifier is spelled "URL", unlike the other constants in this group.)
cURLUnreservedChars = [
'A'..'Z', 'a'..'z', '0'..'9', '-', '_', '.', '~'
];
// Special reserved char used to represent a space in query string encoding
cPlus = '+';
{
NOTE:
URIEncode should be applied to the component parts of the URI before they
are combined, not to the whole URI. See RFC 3986 section 2.4
}
// URIEncode: percent-encodes every octet of the UTF-8 form of S that is not in
// cURLUnreservedChars. The UnicodeString and AnsiString overloads pass S
// through UTF8Encode and then call URIEncode again with the result.
function URIEncode(const S: UTF8String): string; overload;
function URIEncode(const S: UnicodeString): string; overload;
function URIEncode(const S: AnsiString): string; overload;
// URIEncodeQueryString: as URIEncode, except that each space in S is
// represented in the result by '+' instead of '%20'. A literal '+' in S is
// percent-encoded.
function URIEncodeQueryString(const S: UTF8String): string; overload;
function URIEncodeQueryString(const S: UnicodeString): string; overload;
function URIEncodeQueryString(const S: AnsiString): string; overload;
// URIDecode: replaces each %XX escape sequence in Str with the octet it
// represents and interprets the resulting octets as UTF-8. Raises
// EConvertError if Str contains a malformed escape sequence.
function URIDecode(const Str: string): string;
// URIDecodeQueryString: as URIDecode, except that each '+' in Str is first
// replaced by '%20', so that it decodes to a space.
function URIDecodeQueryString(const Str: string): string;
implementation
uses
SysUtils, StrUtils;
resourcestring
rsEscapeError = 'String to be decoded contains invalid % escape sequence';
const
cPercentEncodedSpace = '%20';
{
Internally, URIDecode operates on UTF-8 strings for both input and output.
This lets us deal easily with any multi-byte characters in the input. There
SHOULDN'T be any such characters since all characters in a percent encoded URI
should map onto the unreserved characters from the ASCII character set.
However we need to allow for badly encoded URIs that *may* contain characters
outside this expected set.
UTF-8 also lets us perform an easy test for '%' characters in the input. Since
'%' can never occur as a UTF-8 lead or continuation byte we can simply test
for the actual character without worrying about whether it is part of a
multi-byte character.
We use UTF-8 for the output string since UTF-8 should have been used to encode
the URI in the first place, therefore percent-encoded octets may map onto
UTF-8 lead or continuation bytes. Using any other string type would give
erroneous results.
We convert the interim UTF-8 result into the native string type before
returning.
}
function URIDecode(const Str: string): string;
  {Decodes a percent-encoded URI, or URI component.
    @param Str [in] String to be decoded. Each '%' must be followed by exactly
      two hexadecimal digits. All other characters are copied unchanged.
    @return Decoded string. The decoded octets are interpreted as UTF-8 and
      converted to the native string type.
    @except EConvertError raised if Str contains a '%' that is not followed by
      two valid hexadecimal digits.
  }
var
  SrcUTF8: UTF8String;  // input string as UTF-8
  SrcIdx: Integer;      // index into source UTF-8 string
  ResUTF8: UTF8String;  // output string as UTF-8
  ResIdx: Integer;      // index of next free octet in result UTF-8 string
  Hex: string;          // hex component of % encoding
  ChValue: Integer;     // character ordinal value from a % encoding
begin
  // Convert input string to UTF-8
  SrcUTF8 := UTF8Encode(Str);
  // Size the decoded UTF-8 string for the worst case, i.e. no escape sequences
  // at all. Decoding never produces more octets than it consumes, so the
  // buffer can't overflow, even when the input turns out to be malformed. We
  // must NOT pre-compute the exact size by counting '%' characters: a
  // truncated trailing escape (e.g. 'a%41%') would make the buffer too small
  // and we would write past its end before detecting the error.
  SetLength(ResUTF8, Length(SrcUTF8));
  SrcIdx := 1;
  ResIdx := 1;
  while SrcIdx <= Length(SrcUTF8) do
  begin
    if SrcUTF8[SrcIdx] = cPercent then
    begin
      // % encoding: decode following two hex chars into required octet
      if Length(SrcUTF8) < SrcIdx + 2 then
        raise EConvertError.Create(rsEscapeError);  // malformed: too short
      Hex := '$' + string(SrcUTF8[SrcIdx + 1] + SrcUTF8[SrcIdx + 2]);
      if not TryStrToInt(Hex, ChValue) then
        raise EConvertError.Create(rsEscapeError);  // malformed: not valid hex
      ResUTF8[ResIdx] := AnsiChar(ChValue);
      Inc(ResIdx);
      Inc(SrcIdx, 3);
    end
    else
    begin
      // plain char or octet of a multi-byte UTF-8 character: copy unchanged
      ResUTF8[ResIdx] := SrcUTF8[SrcIdx];
      Inc(ResIdx);
      Inc(SrcIdx);
    end;
  end;
  // Trim result to the number of octets actually written: each escape sequence
  // consumed three source octets but produced only one
  SetLength(ResUTF8, ResIdx - 1);
  // Convert back to native string type for result
  Result := UTF8ToString(ResUTF8);
end;
function URIDecodeQueryString(const Str: string): string;
  {Decodes a URI query string in which spaces have been encoded as '+'.
    @param Str [in] Encoded query string.
    @return Decoded string.
  }
var
  PercentEncoded: string; // Str with every '+' rewritten as '%20'
begin
  // Str is still URI encoded at this point, so a space must not appear in it
  // literally (space is not one of the unreserved characters). We therefore
  // swap each plus sign for a percent-encoded space rather than a real one ...
  PercentEncoded := ReplaceStr(Str, cPlus, cPercentEncodedSpace);
  // ... and then decode the resulting percent-encoded string as normal.
  Result := URIDecode(PercentEncoded);
end;
{
Extract from RFC 3986 section 2.5:
"the data should first be encoded as octets according to the UTF-8 character
encoding [STD63]; then only those octets that do not correspond to
characters in the unreserved set should be percent-encoded."
This means we can simply scan a UTF-8 string and encode anything we find that
isn't in the unreserved set. We needn't worry about detecting lead and
continuation bytes in the UTF-8 encoding because every octet of a multi-byte
sequence has a value of $80 or greater, while all unreserved characters are
from a sub-set of ASCII and therefore have an ordinal value of less than $80.
For details of the UTF-8 encoding see http://en.wikipedia.org/wiki/UTF-8
}
// Assumes Defined(UNICODE)
function URIEncode(const S: UTF8String): string; overload;
  {Percent-encodes a UTF-8 string.
    @param S [in] UTF-8 string to be encoded.
    @return Encoded string: octets from cURLUnreservedChars are copied as
      they are, every other octet is written as '%' followed by its two digit
      hexadecimal value.
  }
var
  Idx: Integer;     // loops thru all octets of S
  Octet: AnsiChar;  // octet currently being examined
begin
  Result := '';
  // Examine the string one octet at a time: no need to distinguish lead or
  // continuation bytes from single byte characters
  for Idx := 1 to Length(S) do
  begin
    Octet := S[Idx];
    if not CharInSet(Octet, cURLUnreservedChars) then
      Result := Result + '%' + IntToHex(Ord(Octet), 2)
    else
      Result := Result + WideChar(Octet);
  end;
end;
function URIEncode(const S: UnicodeString): string; overload;
  {Percent-encodes a Unicode string.
    @param S [in] Unicode string to be encoded.
    @return Result of percent-encoding the UTF-8 form of S.
  }
var
  Octets: UTF8String; // S converted to UTF-8
begin
  Octets := UTF8Encode(S);
  // Hand over to the UTF-8 overload, which does the actual encoding
  Result := URIEncode(Octets);
end;
function URIEncode(const S: AnsiString): string; overload;
  {Percent-encodes an ANSI string.
    @param S [in] ANSI string to be encoded.
    @return Result of percent-encoding the UTF-8 form of S.
  }
var
  Octets: UTF8String; // S converted to UTF-8
begin
  Octets := UTF8Encode(S);
  // Hand over to the UTF-8 overload, which does the actual encoding
  Result := URIEncode(Octets);
end;
function URIEncodeQueryString(const S: UTF8String): string; overload;
  {Encodes a UTF-8 string for use in a URI query string, where spaces are
  represented by '+'.
    @param S [in] UTF-8 string to be encoded.
    @return Percent-encoded string in which each space appears as '+'.
  }
var
  Encoded: string;  // S after normal URI encoding
begin
  // URI encode the string first. Any '+' symbols already present in S become
  // percent-encoded by this step, so they can't be mistaken for the ones we
  // are about to add. Spaces come out of this step as %20.
  Encoded := URIEncode(S);
  // Now replace all occurrences of %20 with '+'.
  Result := ReplaceStr(Encoded, cPercentEncodedSpace, cPlus);
end;
function URIEncodeQueryString(const S: UnicodeString): string; overload;
  {Encodes a Unicode string for use in a URI query string.
    @param S [in] Unicode string to be encoded.
    @return Result of query string encoding the UTF-8 form of S.
  }
var
  Octets: UTF8String; // S converted to UTF-8
begin
  Octets := UTF8Encode(S);
  // Hand over to the UTF-8 overload, which does the actual encoding
  Result := URIEncodeQueryString(Octets);
end;
function URIEncodeQueryString(const S: AnsiString): string; overload;
  {Encodes an ANSI string for use in a URI query string.
    @param S [in] ANSI string to be encoded.
    @return Result of query string encoding the UTF-8 form of S.
  }
var
  Octets: UTF8String; // S converted to UTF-8
begin
  Octets := UTF8Encode(S);
  // Hand over to the UTF-8 overload, which does the actual encoding
  Result := URIEncodeQueryString(Octets);
end;
end.
Want the latest updates on software, tech news, and AI?
Get latest updates about software, tech news, and AI from SourceForge directly in your inbox once a month.