UTF-8 aware hex dump utility
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 | module Main where
-- TODO
-- ----
-- o figure out how to use utf8-string for the decoding with
-- errors [this lets us get rid of the UTF8 module]
-- o colourise badly encoded characters (switch to something
-- easier to type than guillmets)
-- o fancy flags
import Control.Monad (forM_)
import Data.Bits
import qualified Data.ByteString as B
import Data.Char (isPrint)
import Data.List (intersperse, intercalate, inits)
import Data.List.Split (split, keepDelimsR, whenElt)
import Data.Word (Word8)
import System.Environment (getArgs)
import System.IO (stdin, hClose, IOMode(ReadMode), openBinaryFile)
import qualified Numeric as N
import UTF8 (decodeOne, Error(..))
main :: IO ()
main =
do args <- getArgs
hs <- case args of
[] -> return [stdin]
_ -> mapM (\f -> openBinaryFile f ReadMode) args
forM_ hs $ \h ->
do bs <- B.unpack `fmap` B.hGetContents h
putStr . unlines
$ map (dump h_size)
$ concatMap (clump h_size)
$ split (keepDelimsR (whenElt isNewline))
$ decode bs
hClose h
where
clump sz = clumpBy (length . showHex . decoratedBytes) sz 1
h_size = 40 -- keep in mind that each byte takes 2 chars a
-- space; and that each code point takes at
-- most 4 bytes. So we need (3 * 4 * n) just
-- to display the hex part of the window
isNewline :: HexChar -> Bool
isNewline (HexChar c _) = c == '\n'
isNewline _ = False
dump :: Int -> [HexChar] -> String
dump hex_sz cs =
hexPart ++ padding ++ charPart
where
hexPart = intercalate " " (map showHex dbs)
dbs = map decoratedBytes cs
charPart = concatMap show cs
padding = replicate (hex_sz - length hexPart) ' '
-- | break a list of items into sublists of length < the clump
-- size, taking into consideration that each item in the clump
-- will have a sep-size padding interspersed
--
-- any item whose length is greater than the clump size
-- is put into a clump by itself
--
-- given a length function
-- @clumpBy (length.show) 8 ["hello", "this", "is", "a", "list"]@
clumpBy :: (a -> Int) -> Int -> Int -> [a] -> [[a]]
clumpBy f l sep items = iter [] items
where
iter acc [] = reverse acc
iter acc cs =
case break toobig (drop 1 $ inits cs) of
([],_) -> next 1 -- first too big
(_,[]) -> iter (cs:acc) [] -- none too big
(_,(x:_)) -> next (length x - 1)
where next n = iter (take n cs : acc) (drop n cs)
toobig x = (sum . intersperse sep . map f) x > l
-- ----------------------------------------------------------------------
--
-- ----------------------------------------------------------------------
data ByteSequence = ByteSequence [Word8] Char -- bytes and sep
decoratedBytes :: HexChar -> ByteSequence
decoratedBytes c = ByteSequence bytes seps
where
bytes = getBytes c
seps = case c of
HexChar _ _ -> '-'
HexError _ _ -> '#'
-- -------------------------------------------------------------------
--
-- -------------------------------------------------------------------
class ShowHex a where
showHex :: a -> String
instance ShowHex Int where
showHex x = N.showHex x ""
instance ShowHex Word8 where
showHex w = showHex big ++ showHex small
where
small, big :: Int
small = fromIntegral $ w .&. 0x0f
big = fromIntegral $ shiftR w 4
instance ShowHex HexChar where
showHex h = concatMap showHex (getBytes h)
instance ShowHex ByteSequence where
showHex (ByteSequence bs s) = intercalate [s] (map showHex bs)
-- -------------------------------------------------------------------
--
-- -------------------------------------------------------------------
data HexChar =
HexChar Char [Word8]
| HexError (Error,Int) [Word8]
instance Show HexChar where
show (HexChar '\r' _) = noshow
show (HexChar '\n' _) = noshow
show (HexChar '\t' _) = noshow
show (HexChar c _) | isPrint c = [c]
show (HexChar _ _) = noshow
show (HexError _ _) = noshow
noshow :: String
noshow = "."
getBytes :: HexChar -> [Word8]
getBytes (HexChar _ ws) = ws
getBytes (HexError _ ws) = ws
decode :: [Word8] -> [HexChar]
decode bytes = iter 0 [] bytes
where
iter :: Int -> [HexChar] -> [Word8] -> [HexChar]
iter _ cs [] = reverse cs
iter idx cs bs
= case decodeOne bs of
(Left e, n, rest) -> let hc = HexError (e,idx) (take n bs)
in iter (idx+n) (hc:cs) rest
(Right c, n, rest) -> let hc = HexChar c (take n bs)
in iter (idx+n) (hc:cs) rest
|