(library (rnrs bytevectors private string)
(export string->utf8 string->utf16 string->utf32
utf8->string utf16->string utf32->string)
(import (rnrs base)
(rnrs control)
(rnrs arithmetic bitwise)
(rnrs mutable-strings)
(rnrs r5rs)
(rnrs bytevectors private core)
(rnrs bytevectors private proto))
(define (string->utf8 string)
(let* ((n (string-length string))
(k (do ((i 0 (+ i 1))
(k 0 (+ k (let ((sv (char->integer (string-ref string i))))
(cond ((<= sv #x007f) 1)
((<= sv #x07ff) 2)
((<= sv #xffff) 3)
(else 4))))))
((= i n) k)))
(bv (make-bytevector k)))
(define (loop i j)
(if (= i n)
bv
(let ((sv (char->integer (string-ref string i))))
(cond ((<= sv #x007f)
(bytevector-u8-set! bv j sv)
(loop (+ i 1) (+ j 1)))
((<= sv #x07ff)
(let ((u0 (bitwise-ior #b11000000
(bitwise-bit-field sv 6 11)))
(u1 (bitwise-ior #b10000000
(bitwise-bit-field sv 0 6))))
(bytevector-u8-set! bv j u0)
(bytevector-u8-set! bv (+ j 1) u1)
(loop (+ i 1) (+ j 2))))
((<= sv #xffff)
(let ((u0 (bitwise-ior #b11100000
(bitwise-bit-field sv 12 16)))
(u1 (bitwise-ior #b10000000
(bitwise-bit-field sv 6 12)))
(u2 (bitwise-ior #b10000000
(bitwise-bit-field sv 0 6))))
(bytevector-u8-set! bv j u0)
(bytevector-u8-set! bv (+ j 1) u1)
(bytevector-u8-set! bv (+ j 2) u2)
(loop (+ i 1) (+ j 3))))
(else
(let ((u0 (bitwise-ior #b11110000
(bitwise-bit-field sv 18 21)))
(u1 (bitwise-ior #b10000000
(bitwise-bit-field sv 12 18)))
(u2 (bitwise-ior #b10000000
(bitwise-bit-field sv 6 12)))
(u3 (bitwise-ior #b10000000
(bitwise-bit-field sv 0 6))))
(bytevector-u8-set! bv j u0)
(bytevector-u8-set! bv (+ j 1) u1)
(bytevector-u8-set! bv (+ j 2) u2)
(bytevector-u8-set! bv (+ j 3) u3)
(loop (+ i 1) (+ j 4))))))))
(loop 0 0)))
(define (utf8->string bv)
(let* ((n (bytevector-length bv))
(replacement-character (integer->char #xfffd))
(bits->char (lambda (bits)
(cond ((<= 0 bits #xd7ff)
(integer->char bits))
((<= #xe000 bits #x10ffff)
(integer->char bits))
(else
replacement-character))))
(begins-with-bom?
(and (<= 3 n)
(= #xef (bytevector-u8-ref bv 0))
(= #xbb (bytevector-u8-ref bv 1))
(= #xbf (bytevector-u8-ref bv 2)))))
(define (result-length)
(define (q0 i k)
(if (= i n)
k
(let ((unit (bytevector-u8-ref bv i))
(i1 (+ i 1))
(k1 (+ k 1)))
(cond ((<= unit #x7f)
(q0 i1 k1))
((<= unit #xc1)
(q0 i1 k1))
((<= unit #xdf)
(q1 i1 k1))
((<= unit #xe0)
(q2 i1 k1 #xa0))
((<= unit #xef)
(q2 i1 k1 #x80))
((<= unit #xf0)
(q3 i1 k1 #x90 #xbf))
((<= unit #xf3)
(q3 i1 k1 #x80 #xbf))
((<= unit #xf4)
(q3 i1 k1 #x80 #x8f))
(else
(q0 i1 k1))))))
(define (q1 i k)
(if (= i n)
k
(let ((unit (bytevector-u8-ref bv i))
(i1 (+ i 1)))
(cond ((< unit #x80)
(q0 i k))
((<= unit #xbf)
(q0 i1 k))
(else
(q0 i k))))))
(define (q2 i k lower)
(if (= i n)
k
(let ((unit (bytevector-u8-ref bv i))
(i1 (+ i 1)))
(cond ((< unit lower)
(q0 i k))
((<= unit #xbf)
(q1 i1 k))
(else
(q0 i k))))))
(define (q3 i k lower upper)
(if (= i n)
k
(let ((unit (bytevector-u8-ref bv i))
(i1 (+ i 1)))
(cond ((< unit lower)
(q0 i k))
((<= unit upper)
(q2 i1 k #x80))
(else
(q0 i k))))))
(if begins-with-bom?
(q0 3 0)
(q0 0 0)))
(let* ((k (result-length))
(s (make-string k)))
(define (q0 i k)
(if (< i n)
(let ((unit (bytevector-u8-ref bv i))
(i1 (+ i 1))
(k1 (+ k 1)))
(cond ((<= unit #x7f)
(string-set! s k (integer->char unit))
(q0 i1 k1))
((<= unit #xc1)
(string-set! s k replacement-character)
(q0 i1 k1))
((<= unit #xdf)
(q1 i1 k (bitwise-and unit #x1f)))
((<= unit #xe0)
(q2 i1 k #xa0 0))
((<= unit #xef)
(q2 i1 k #x80 (bitwise-and unit #x0f)))
((<= unit #xf0)
(q3 i1 k #x90 #xbf 0))
((<= unit #xf3)
(q3 i1 k #x80 #xbf (bitwise-and unit #x07)))
((<= unit #xf4)
(q3 i1 k #x80 #x8f (bitwise-and unit #x07)))
(else
(string-set! s k replacement-character)
(q0 i1 k1))))))
(define (q1 i k bits)
(if (= i n)
(string-set! s k replacement-character)
(let ((unit (bytevector-u8-ref bv i))
(i1 (+ i 1))
(k1 (+ k 1)))
(cond ((< unit #x80)
(string-set! s k replacement-character)
(q0 i k1))
((<= unit #xbf)
(string-set! s k (bits->char
(bitwise-ior
(bitwise-arithmetic-shift-left bits 6)
(bitwise-and unit #x3f))))
(q0 i1 k1))
(else
(string-set! s k replacement-character)
(q0 i k1))))))
(define (q2 i k lower bits)
(if (= i n)
(string-set! s k replacement-character)
(let ((unit (bytevector-u8-ref bv i))
(i1 (+ i 1)))
(cond ((< unit lower)
(string-set! s k replacement-character)
(q0 i (+ k 1)))
((<= unit #x00bf)
(q1 i1 k (bitwise-ior
(bitwise-arithmetic-shift-left bits 6)
(bitwise-and unit #x3f))))
(else
(string-set! s k replacement-character)
(q0 i (+ k 1)))))))
(define (q3 i k lower upper bits)
(if (= i n)
(string-set! s k replacement-character)
(let ((unit (bytevector-u8-ref bv i))
(i1 (+ i 1)))
(cond ((< unit lower)
(string-set! s k replacement-character)
(q0 i (+ k 1)))
((<= unit upper)
(q2 i1 k #x80 (bitwise-ior
(bitwise-arithmetic-shift-left bits 6)
(bitwise-and unit #x3f))))
(else
(string-set! s k replacement-character)
(q0 i (+ k 1)))))))
(if begins-with-bom?
(q0 3 0)
(q0 0 0))
s)))
(define (string->utf16 string . rest)
(let* ((endianness (cond ((null? rest) 'big)
((not (null? (cdr rest)))
(apply assertion-violation 'string->utf16
"too many arguments" string rest))
((eq? (car rest) 'big) 'big)
((eq? (car rest) 'little) 'little)
(else (endianness-violation
'string->utf16
(car rest)))))
(hi (if (eq? 'big endianness) 0 1))
(lo (- 1 hi))
(n (string-length string)))
(define (result-length)
(do ((i 0 (+ i 1))
(k 0 (let ((sv (char->integer (string-ref string i))))
(if (< sv #x10000) (+ k 2) (+ k 4)))))
((= i n) k)))
(let ((bv (make-bytevector (result-length))))
(define (loop i k)
(if (< i n)
(let ((sv (char->integer (string-ref string i))))
(if (< sv #x10000)
(let ((hibits (bitwise-bit-field sv 8 16))
(lobits (bitwise-bit-field sv 0 8)))
(bytevector-u8-set! bv (+ k hi) hibits)
(bytevector-u8-set! bv (+ k lo) lobits)
(loop (+ i 1) (+ k 2)))
(let* ((x (- sv #x10000))
(hibits (bitwise-bit-field x 10 20))
(lobits (bitwise-bit-field x 0 10))
(hi16 (bitwise-ior #xd800 hibits))
(lo16 (bitwise-ior #xdc00 lobits))
(hi1 (bitwise-bit-field hi16 8 16))
(lo1 (bitwise-bit-field hi16 0 8))
(hi2 (bitwise-bit-field lo16 8 16))
(lo2 (bitwise-bit-field lo16 0 8)))
(bytevector-u8-set! bv (+ k hi) hi1)
(bytevector-u8-set! bv (+ k lo) lo1)
(bytevector-u8-set! bv (+ k hi 2) hi2)
(bytevector-u8-set! bv (+ k lo 2) lo2)
(loop (+ i 1) (+ k 4)))))))
(loop 0 0)
bv)))
(define (utf16->string bytevector . rest)
(let* ((n (bytevector-length bytevector))
(begins-with-bom?
(and (null? rest)
(<= 2 n)
(let ((b0 (bytevector-u8-ref bytevector 0))
(b1 (bytevector-u8-ref bytevector 1)))
(or (and (= b0 #xfe) (= b1 #xff) 'big)
(and (= b0 #xff) (= b1 #xfe) 'little)))))
(endianness (cond ((null? rest) (or begins-with-bom? 'big))
((eq? (car rest) 'big) 'big)
((eq? (car rest) 'little) 'little)
(else (endianness-violation
'utf16->string
(car rest)))))
(hi (if (eq? 'big endianness) 0 1))
(lo (- 1 hi))
(replacement-character (integer->char #xfffd)))
(define (result-length)
(define (loop i k)
(if (>= i n)
k
(let ((octet (bytevector-u8-ref bytevector i)))
(cond ((< octet #xd8)
(loop (+ i 2) (+ k 1)))
((< octet #xdc)
(let* ((i2 (+ i 2))
(octet2 (if (< i2 n)
(bytevector-u8-ref bytevector i2)
0)))
(if (<= #xdc octet2 #xdf)
(loop (+ i 4) (+ k 1))
(loop i2 (+ k 1)))))
(else (loop (+ i 2) (+ k 1)))))))
(if begins-with-bom?
(loop (+ hi 2) 0)
(loop hi 0)))
(if (odd? n)
(assertion-violation 'utf16->string
"bytevector has odd length" bytevector))
(let ((s (make-string (result-length))))
(define (loop i k)
(if (< i n)
(let ((hibits (bytevector-u8-ref bytevector (+ i hi)))
(lobits (bytevector-u8-ref bytevector (+ i lo))))
(cond ((< hibits #xd8)
(let ((c (integer->char
(bitwise-ior
(bitwise-arithmetic-shift-left hibits 8)
lobits))))
(string-set! s k c))
(loop (+ i 2) (+ k 1)))
((< hibits #xdc)
(let* ((i2 (+ i hi 2))
(i3 (+ i lo 2))
(octet2 (if (< i2 n)
(bytevector-u8-ref bytevector i2)
0))
(octet3 (if (< i2 n)
(bytevector-u8-ref bytevector i3)
0)))
(if (<= #xdc octet2 #xdf)
(let* ((sv (+ #x10000
(bitwise-arithmetic-shift-left
(bitwise-and
(bitwise-ior
(bitwise-arithmetic-shift-left
hibits 8)
lobits)
#x03ff)
10)
(bitwise-and
(bitwise-ior
(bitwise-arithmetic-shift-left
octet2 8)
octet3)
#x03ff)))
(c (if (<= #x10000 sv #x10ffff)
(integer->char sv)
replacement-character)))
(string-set! s k c)
(loop (+ i 4) (+ k 1)))
(begin (string-set! s k replacement-character)
(loop (+ i 2) (+ k 1))))))
((< hibits #xe0)
(string-set! s k replacement-character)
(loop (+ i 2) (+ k 1)))
(else
(let ((c (integer->char
(bitwise-ior
(bitwise-arithmetic-shift-left hibits 8)
lobits))))
(string-set! s k c))
(loop (+ i 2) (+ k 1)))))))
(if begins-with-bom?
(loop 2 0)
(loop 0 0))
s)))
(define (string->utf32 string . rest)
(let* ((endianness (cond ((null? rest) 'big)
((eq? (car rest) 'big) 'big)
((eq? (car rest) 'little) 'little)
(else (endianness-violation
'string->utf32
(car rest)))))
(n (string-length string))
(result (make-bytevector (* 4 n))))
(do ((i 0 (+ i 1)))
((= i n) result)
(bytevector-u32-set! result
(* 4 i)
(char->integer (string-ref string i))
endianness))))
(define (utf32->string bytevector . rest)
(let* ((n (bytevector-length bytevector))
(begins-with-bom?
(and (null? rest)
(<= 4 n)
(let ((b0 (bytevector-u8-ref bytevector 0))
(b1 (bytevector-u8-ref bytevector 1))
(b2 (bytevector-u8-ref bytevector 2))
(b3 (bytevector-u8-ref bytevector 3)))
(or (and (= b0 0) (= b1 0) (= b2 #xfe) (= b3 #xff)
'big)
(and (= b0 #xff) (= b1 #xfe) (= b2 0) (= b3 0)
'little)))))
(endianness (cond ((null? rest) (or begins-with-bom? 'big))
((eq? (car rest) 'big) 'big)
((eq? (car rest) 'little) 'little)
(else (endianness-violation
'string->utf32
(car rest)))))
(i0 (if begins-with-bom? 4 0))
(result (if (zero? (remainder n 4))
(make-string (quotient (- n i0) 4))
(assertion-violation
'utf32->string
"Bytevector has bad length." bytevector))))
(do ((i i0 (+ i 4))
(j 0 (+ j 1)))
((= i n) result)
(let* ((sv (bytevector-u32-ref bytevector i endianness))
(sv (cond ((< sv #xd800) sv)
((< sv #xe000) #xfffd) ((< sv #x110000) sv)
(else #xfffd))) (c (integer->char sv)))
(string-set! result j c)))))
(define (endianness-violation who what)
(assertion-violation who "bad endianness" what))
)