darcsden :: kowey -> uhexdump -> blob

UTF-8 aware hex dump utility

root / uhexdump.hs

module Main where

-- TODO
-- ----
-- o figure out how to use utf8-string for the decoding with
--   errors [this lets us get rid of the UTF8 module]
-- o colourise badly encoded characters (switch to something
--   easier to type than guillmets)
-- o fancy flags

import Control.Monad (forM_)
import Data.Bits
import qualified Data.ByteString as B
import Data.Char (isPrint)
import Data.List (intersperse, intercalate, inits)
import Data.List.Split (split, keepDelimsR, whenElt)
import Data.Word (Word8)
import System.Environment (getArgs)
import System.IO (stdin, hClose, IOMode(ReadMode), openBinaryFile)
import qualified Numeric as N

import UTF8 (decodeOne, Error(..))

main :: IO ()
main =
 do args <- getArgs
    hs <- case args of
           [] -> return [stdin]
           _  -> mapM (\f -> openBinaryFile f ReadMode) args
    forM_ hs $ \h ->
     do bs <- B.unpack `fmap` B.hGetContents h
        putStr . unlines
               $ map (dump h_size)
               $ concatMap (clump h_size)
               $ split (keepDelimsR (whenElt isNewline))
               $ decode bs
        hClose h
 where
  clump sz = clumpBy (length . showHex . decoratedBytes) sz 1
  h_size = 40 -- keep in mind that each byte takes 2 chars a
              -- space; and that each code point takes at
              -- most 4 bytes.  So we need (3 * 4 * n) just
              -- to display the hex part of the window

isNewline :: HexChar -> Bool
isNewline (HexChar c _) = c == '\n'
isNewline _ = False

dump :: Int -> [HexChar] -> String
dump hex_sz cs =
   hexPart ++ padding ++ charPart
 where
   hexPart  = intercalate " " (map showHex dbs)
   dbs      = map decoratedBytes cs
   charPart = concatMap show cs
   padding  = replicate (hex_sz - length hexPart) ' '


-- | break a list of items into sublists of length < the clump
--   size, taking into consideration that each item in the clump
--   will have a sep-size padding interspersed
--
--   any item whose length is greater than the clump size
--   is put into a clump by itself
--
--   given a length function
--   @clumpBy (length.show) 8 ["hello", "this", "is", "a", "list"]@
clumpBy :: (a -> Int) -> Int -> Int -> [a] -> [[a]]
clumpBy f l sep items = iter [] items
 where
  iter acc [] = reverse acc
  iter acc cs =
   case break toobig (drop 1 $ inits cs) of
        ([],_)    -> next 1           -- first too big
        (_,[])    -> iter (cs:acc) [] -- none too big
        (_,(x:_)) -> next (length x - 1)
   where next n = iter (take n cs : acc) (drop n cs)
  toobig x = (sum . intersperse sep . map f) x > l

-- ----------------------------------------------------------------------
--
-- ----------------------------------------------------------------------

data ByteSequence = ByteSequence [Word8] Char -- bytes and sep

decoratedBytes :: HexChar -> ByteSequence
decoratedBytes c = ByteSequence bytes seps
  where
   bytes = getBytes c
   seps  = case c of
     HexChar  _ _ -> '-'
     HexError _ _ -> '#'

-- -------------------------------------------------------------------
--
-- -------------------------------------------------------------------

class ShowHex a where
 showHex :: a -> String

instance ShowHex Int where
 showHex x = N.showHex x ""

instance ShowHex Word8 where
 showHex w = showHex big ++ showHex small
  where
    small, big :: Int
    small = fromIntegral $ w .&. 0x0f
    big   = fromIntegral $ shiftR w 4

instance ShowHex HexChar where
 showHex h = concatMap showHex (getBytes h)

instance ShowHex ByteSequence where
 showHex (ByteSequence bs s) = intercalate [s] (map showHex bs)

-- -------------------------------------------------------------------
--
-- -------------------------------------------------------------------

data HexChar =
    HexChar  Char        [Word8]
  | HexError (Error,Int) [Word8]

instance Show HexChar where
 show (HexChar '\r' _) = noshow
 show (HexChar '\n' _) = noshow
 show (HexChar '\t' _) = noshow
 show (HexChar c _) | isPrint c  = [c]
 show (HexChar _ _) = noshow
 show (HexError _ _) = noshow

noshow :: String
noshow = "."

getBytes :: HexChar -> [Word8]
getBytes (HexChar _ ws) = ws
getBytes (HexError _ ws) = ws

decode :: [Word8] -> [HexChar]
decode bytes = iter 0 [] bytes
 where
 iter :: Int -> [HexChar] -> [Word8] -> [HexChar]
 iter _ cs [] = reverse cs
 iter idx cs bs
  = case decodeOne bs of
    (Left e, n, rest)  -> let hc = HexError (e,idx) (take n bs)
                          in iter (idx+n) (hc:cs) rest
    (Right c, n, rest) -> let hc = HexChar c (take n bs)
                          in iter (idx+n) (hc:cs) rest