--- a/libraries/base/base.cabal 2009-12-10 20:23:43.000000000 +0200 +++ b/libraries/base/base.cabal 2010-07-15 15:17:59.003960401 +0300 @@ -161,6 +161,7 @@ Foreign.C, Foreign.C.Error, Foreign.C.String, + Foreign.C.String.Locale, Foreign.C.Types, Foreign.ForeignPtr, Foreign.Marshal, --- a/libraries/base/Foreign/C/String/Locale.hs 1970-01-01 03:00:00.000000000 +0300 +++ b/libraries/base/Foreign/C/String/Locale.hs 2010-07-16 00:07:29.409911799 +0300 @@ -0,0 +1,197 @@ +{- | This module implements Foreign.C.String according to Foreign Function Interface Addendum, i.e. C strings are treated as they are encoded with the current locale. Also this module adds peekFree* functions. Command @peekFree* free s@ frees @s@ with @free@ eventually, but not immediately. a peek* function sends a copy of its argument to peekFree*, so peek* is less effective. -} +{- Apply this patch with "patch -u -p1" in the root directory of GHC. This patch is for GHC and Linux. Only System.Environment is patched to use locale aware functions. + +Conversion is done by GHC.IO.Encoding, the current locale is 'GHC.IO.Encoding.localeEncoding'. + +When implementing, the main obstacle is that we do not know the length of the resulting string. So we use intermediate buffers which are filled and flushed several times during one run of a conversion function. + +For correct working there must be @sizeOf CChar == sizeOf Word8@. + +Behavior of 'GHC.IO.Encoding.Iconv.iconvRecode' is changed, see my comment in the source code of that function. I can not predict whether my change brokes something. + +Patching all functions in "base" to use locale aware functions. Character encoding conversion ("conversion" below) is used also when handling errors of C functions. When a C function returns an error, Foreign.C.Error.errnoToIOError converts @errno@ to an error name. Contents of an error name is in LC_MESSAGES character encoding (see POSIX, string.h, strerror). So when handling errors of C functions, we need conversion. +But conversion functions (you can find them in POSIX, iconv.h) are C functions and themseves may return an error. This leads to a loop in a Haskell module dependency graph and possibly to an infinite loop at run time. The simplest solution is to not retrieve error names for errors of conversion functions. +This suggests reorganization of Haskell modules: +Extract conversion independent code: + Foreign.C.String -> Foreign.C.String.NoLocale + Foreign.C.Error -> Foreign.C.Error.NoLocale +Redirect GHC.IO.Encoding.Iconv to *.NoLocale modules -} +{-# OPTIONS_GHC -XNoImplicitPrelude #-} +module Foreign.C.String.Locale + ( peekFreeCString, peekFreeCStringLen + , peekCString, peekCStringLen + , newCString, newCStringLen + , withCString, withCStringLen + ) where +import System.IO.Unsafe +import GHC.List +import Data.Word +import Data.List (sum) +import Data.Tuple (fst) +import Data.Either +import GHC.Real +import GHC.Num +import GHC.Base +import GHC.IO.Buffer +import GHC.IO.Encoding +import GHC.IO.Handle.Text (unpack) +import Control.Concurrent.MVar +import Foreign.Ptr +import Foreign.ForeignPtr +import Foreign.Storable +import Foreign.Marshal.Alloc (allocaBytes) +import Foreign.Marshal.Array (allocaArray, allocaArray0, mallocArray0) +import Foreign.C.Types +import Foreign.C.String (CString, CStringLen) + +charArraySizePeek = 17 {- length of a 'CharBuffer' for peek* functions -} +charArraySizeHs2C = 13 {- length of a 'CharBuffer' for converting from Haskell to C -} +byteArraySize = 37 {- length of a @Buffer Word8@ for converting from Haskell to C -} + +foreign import ccall unsafe "string.h memcpy" + memcpy :: Ptr Word8 -> Ptr Word8 -> CSize -> IO () +foreign import ccall unsafe "string.h strlen" strlen :: Ptr CChar -> IO CSize + +{- @peekFreeCStringLen free s@ converts @s@ to @Buffer Word8@. Then conversion @Buffer Word8 -> charBuffer :: CharBuffer -> String@. The resulting 'String' is produced lazily. 'MVar' schedules access to it for several threads. -} +peekFreeCStringLen :: (CStringLen -> IO ()) -> CStringLen -> IO String +peekFreeCStringLen free cs@(byteP, len) = case localeEncoding of + TextEncoding {mkTextDecoder = ioCodec} -> do + codec <- ioCodec + let rc restMVar0 = unsafePerformIO $ do + x <- takeMVar restMVar0 + case x of + Left byteBuffer -> allocaBytes (charArraySizePeek * charSize) $ \charP -> do + charFP <- newForeignPtr_ (castPtr charP) + (byteBuffer, charBuffer) <- encode codec byteBuffer (Buffer + { bufRaw = charFP, bufState = WriteBuffer + , bufSize = charArraySizePeek, bufL = 0, bufR = 0}) + let k rest = case charBuffer of + Buffer {bufRaw = bufRaw0, bufL = bufL0, bufR = bufR0} -> do + x <- unpack bufRaw0 bufL0 bufR0 rest + putMVar restMVar0 (Right x) + return x + case isEmptyBuffer byteBuffer of + True -> do + free cs + k "" + False -> do + restMVar1 <- newMVar (Left byteBuffer) + k (rc restMVar1) + Right x -> return x + byteFP <- newForeignPtr_ (castPtr byteP) + restMVar <- newMVar (Left (Buffer + { bufRaw = byteFP, bufState = ReadBuffer + , bufSize = len, bufL = 0, bufR = len})) + return (rc restMVar) +{- version without MVar +peekFreeCStringLen free cs@(byteP, len) = case localeEncoding of + TextEncoding {mkTextDecoder = ioCodec} -> do + codec <- ioCodec + charBuffer <- newCharBuffer 100 WriteBuffer + byteFP <- newForeignPtr_ (castPtr byteP) + let + byteBuffer = Buffer {bufRaw = byteFP, bufState = ReadBuffer + , bufSize = len, bufL = 0, bufR = len} + rc byteBuffer = case isEmptyBuffer byteBuffer of + True -> do + free cs + return "" + False -> do + (byteBuffer, charBuffer) <- encode codec byteBuffer charBuffer + rest <- unsafeInterleaveIO (rc byteBuffer) + case charBuffer of + Buffer {bufRaw = bufRaw0, bufL = bufL0, bufR = bufR0} -> + unpack bufRaw0 bufL0 bufR0 rest + in rc byteBuffer +-} + +peekFreeCString :: (CString -> IO ()) -> CString -> IO String +peekFreeCString free cs = do + len <- strlen cs + peekFreeCStringLen (\(cs, _) -> free cs) (cs, fromIntegral len) + +peekCStringLen :: CStringLen -> IO String +peekCStringLen (p0, len) = do + p1 <- mallocForeignPtrBytes len + withForeignPtr p1 $ \p1 -> do + memcpy p1 (castPtr p0) (fromIntegral len) + peekFreeCStringLen (\_ -> return ()) (castPtr p1, len) + +peekCString :: CString -> IO String +peekCString p0 = do + len <- strlen p0 {- length including the terminating byte -} + p1 <- mallocForeignPtrBytes (fromIntegral len) + withForeignPtr p1 $ \p1 -> do + memcpy p1 (castPtr p0) len + peekFreeCStringLen (\_ -> return ()) (castPtr p1, fromIntegral len) + +withAllocaByteBuffer :: (Buffer Word8 -> IO a) -> IO a +withAllocaByteBuffer k = allocaBytes byteArraySize $ \byteP -> do + byteFP <- newForeignPtr_ (castPtr byteP) + k (Buffer {bufRaw = byteFP, bufState = WriteBuffer + , bufSize = byteArraySize, bufL = 0, bufR = 0}) + +{- Conversion @String -> charBuffer :: CharBuffer -> bs :: [Buffer Word8]@. @bs@ is a sequence of parts of the resulting C string. Sequence is in reverse order. This function is parametrized by allocation function and subcomputation. -} +newCStringLenAlloc :: (Int -> forall a. (CString -> IO a) -> IO a) -> (CStringLen -> IO a) -> String -> IO a +newCStringLenAlloc alloc subio xs = allocaArray charArraySizeHs2C $ \charP -> case localeEncoding of + TextEncoding {mkTextEncoder = ioCodec} -> do + codec <- ioCodec + charFP <- newForeignPtr_ (castPtr charP) + let stringToCharBuffer xs = let + mkCharBuffer size = Buffer + { bufRaw = charFP, bufState = ReadBuffer + , bufSize = charArraySizeHs2C, bufL = 0, bufR = size} + rc size ix xs = case size >= charArraySizeHs2C of + True -> return (mkCharBuffer charArraySizeHs2C, xs) + False -> case xs of + [] -> return (mkCharBuffer size, []) + x : xs -> do + ix <- writeCharBufPtr charP ix x + rc (size+1) ix xs + in rc 0 0 xs + let rc0 k xs charBuffer byteBuffer0 bs = let + k1 (charBuffer, xs) = do + let filledOld = bufferElems byteBuffer0 + (charBuffer, byteBuffer0) <- encode codec charBuffer byteBuffer0 + let rc1 = rc0 k xs charBuffer + case filledOld == bufferElems byteBuffer0 || isFullBuffer byteBuffer0 of + True -> withAllocaByteBuffer $ \byteBuffer1 -> + rc1 byteBuffer1 (byteBuffer0 : bs) + False -> rc1 byteBuffer0 bs + in case isEmptyBuffer charBuffer of + False -> k1 (charBuffer, xs) + True -> case xs of + [] -> k (byteBuffer0 : bs) + _ -> stringToCharBuffer xs >>= k1 + let k0 bs = let + lenAll = sum (map bufR bs) + in alloc lenAll $ \byteAll -> let + rc1 bs to0 = case bs of + [] -> subio (byteAll, lenAll) + b : bs -> withForeignPtr (bufRaw b) $ \from -> do + let lenBuf = bufR b + let to1 = plusPtr to0 (- lenBuf) + _ <- memcpy (castPtr to1) from (fromIntegral lenBuf) + rc1 bs to1 + in rc1 bs (plusPtr byteAll lenAll) + let charBuffer = Buffer + { bufRaw = charFP, bufState = ReadBuffer + , bufSize = charArraySizeHs2C, bufL = 0, bufR = 0} + withAllocaByteBuffer $ \byteBuffer -> rc0 k0 xs charBuffer byteBuffer [] + +newCStringLen :: String -> IO CStringLen +newCStringLen = newCStringLenAlloc + (\size k -> mallocArray0 size >>= k) + return + +newCString :: String -> IO CString +newCString s = fmap fst (newCStringLen s) + +withCStringLen :: String -> (CStringLen -> IO a) -> IO a +withCStringLen xs k = newCStringLenAlloc + (\size k -> mallocArray0 size >>= k) + k xs + +withCString :: String -> (CString -> IO a) -> IO a +withCString xs k = withCStringLen xs (\(p, _) -> k p) + --- a/libraries/base/GHC/IO/Encoding/Iconv.hs 2009-12-10 20:23:43.000000000 +0200 +++ b/libraries/base/GHC/IO/Encoding/Iconv.hs 2010-07-15 21:53:18.794961321 +0300 @@ -52,7 +52,7 @@ iconv_trace s = puts s puts :: String -> IO () -puts s = do withCStringLen (s++"\n") $ \(p, len) -> +puts s = do withCAStringLen (s++"\n") $ \(p, len) -> c_write 1 (castPtr p) (fromIntegral len) return () @@ -103,7 +103,7 @@ #if HAVE_LANGINFO_H cstr <- c_localeEncoding -- use nl_langinfo(CODESET) to get the encoding -- if we have it - r <- peekCString cstr + r <- peekCAString cstr mkTextEncoding r #else mkTextEncoding "" -- GNU iconv accepts "" to mean the -- locale encoding. @@ -149,8 +149,8 @@ -> (IConv -> Buffer a -> Buffer b -> IO (Buffer a, Buffer b)) -> IO (BufferCodec a b ()) newIConv from to fn = - withCString from $ \ from_str -> - withCString to $ \ to_str -> do + withCAString from $ \ from_str -> + withCAString to $ \ to_str -> do iconvt <- throwErrnoIfMinus1 "mkTextEncoding" $ hs_iconv_open to_str from_str let iclose = throwErrnoIfMinus1_ "Iconv.close" $ hs_iconv_close iconvt return BufferCodec{ @@ -204,10 +204,11 @@ errno <- getErrno case errno of e | e == eINVAL - || (e == e2BIG || e == eILSEQ) && new_inleft' /= (iw-ir) -> do + || (e == e2BIG) || (e == eILSEQ) && new_inleft' /= (iw-ir) -> do iconv_trace ("iconv ignoring error: " ++ show (errnoToIOError "iconv" e Nothing Nothing)) -- Output overflow is relatively harmless, unless -- we made no progress at all. + {- e2BIG with "no progress" is not an error. Suppose that 'output' has NO free bytes. Suppose that the first character in 'input' will be converted to NI bytes, NI>NO. A program think that 'output' is not full, but 'output' may not be filled more. The only way to recognize this situation is to run 'iconv' and check that we made no progress. because only 'iconv' knows values NI and NO. --beroal -} -- -- Similarly, we ignore EILSEQ unless we converted no -- characters. Sometimes iconv reports EILSEQ for a --- a/libraries/base/GHC/IO/Handle/Text.hs 2009-12-10 20:23:43.000000000 +0200 +++ b/libraries/base/GHC/IO/Handle/Text.hs 2010-07-15 15:17:59.028960361 +0300 @@ -24,6 +24,7 @@ commitBuffer', -- hack, see below hGetBuf, hGetBufNonBlocking, hPutBuf, hPutBufNonBlocking, memcpy, + unpack, ) where import GHC.IO --- a/libraries/base/System/Environment.hs 2009-12-10 20:23:42.000000000 +0200 +++ b/libraries/base/System/Environment.hs 2010-07-15 22:03:24.696960359 +0300 @@ -31,7 +31,9 @@ #ifdef __GLASGOW_HASKELL__ import Data.List import Foreign -import Foreign.C +import Foreign.C.Types +import Foreign.C.String (CString, CStringLen) +import Foreign.C.String.Locale (peekFreeCString, withCString, newCString) import Control.Exception.Base ( bracket ) import Control.Monad -- import GHC.IO @@ -64,7 +66,7 @@ getProgArgv p_argc p_argv p <- fromIntegral `liftM` peek p_argc argv <- peek p_argv - peekArray (p - 1) (advancePtr argv 1) >>= mapM peekCString + peekArray (p - 1) (advancePtr argv 1) >>= mapM (peekFreeCString (\_ -> return ())) foreign import ccall unsafe "getProgArgv" @@ -90,7 +92,7 @@ unpackProgName :: Ptr (Ptr CChar) -> IO String -- argv[0] unpackProgName argv = do - s <- peekElemOff argv 0 >>= peekCString + s <- peekElemOff argv 0 >>= peekFreeCString (\_ -> return ()) return (basename s) where basename :: String -> String @@ -122,7 +124,7 @@ withCString name $ \s -> do litstring <- c_getenv s if litstring /= nullPtr - then peekCString litstring + then peekFreeCString (\_ -> return ()) litstring else ioException (IOError Nothing NoSuchThing "getEnv" "no environment variable" Nothing (Just name)) @@ -185,7 +187,7 @@ pBlock <- getEnvBlock if pBlock == nullPtr then return [] else do - stuff <- peekArray0 nullPtr pBlock >>= mapM peekCString + stuff <- peekArray0 nullPtr pBlock >>= mapM (peekFreeCString (\_ -> return ())) return (map divvy stuff) where divvy str =