wok diff html2text/stuff/patch-utf8-html2text-1.3.2a.diff @ rev 22711

exempi: add bin/exempi
author: Pascal Bellard <pascal.bellard@slitaz.org>
date: Wed Jan 22 11:18:51 2020 +0100 (2020-01-22)
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/html2text/stuff/patch-utf8-html2text-1.3.2a.diff	Wed Jan 22 11:18:51 2020 +0100
     1.3 @@ -0,0 +1,706 @@
     1.4 +diff -r -u -bB html2text-1.3.2a/Area.C html2text-1.3.2a-patched/Area.C
     1.5 +--- html2text-1.3.2a/Area.C	2003-11-23 12:05:29.000000000 +0100
     1.6 ++++ html2text-1.3.2a-patched/Area.C	2005-05-13 22:19:59.862137688 +0200
     1.7 +@@ -36,10 +36,13 @@
     1.8 + #include <iostream>
     1.9 + 
    1.10 + #include "Area.h"
    1.11 ++#include "html.h"
    1.12 + #include "string.h"
    1.13 + 
    1.14 + #define LATIN1_nbsp 160
    1.15 + 
    1.16 ++extern int use_encoding;
    1.17 ++
    1.18 + /* ------------------------------------------------------------------------- */
    1.19 + 
    1.20 + #define malloc_array(type, size)\
    1.21 +@@ -81,6 +84,27 @@
    1.22 + 
    1.23 + /* ------------------------------------------------------------------------- */
    1.24 + 
    1.25 ++/*           utf_length() and utf_width()
    1.26 ++ *
    1.27 ++ *     Deliberately simplified way of measuring a UTF-8 string in
    1.28 ++ *   characters. No validation is done: ASCII bytes and lead bytes
    1.29 ++ *   of multibyte sequences are counted, continuation bytes
    1.30 ++ *   (10xxxxxx) are skipped. When USE_UTF8 is false the plain
    1.31 ++ *   cell count is returned.     --YS
    1.32 ++ */
    1.33 ++
    1.34 ++unsigned int
    1.35 ++Line::utf_length(size_type f, size_type t) const   /* characters in cells [f, t) */
    1.36 ++{
    1.37 ++  size_type m = (t < length_ ? t : length_);   /* clamp the end index to the line length */
    1.38 ++  size_type r = m - f;   /* cell count of [f, m); NOTE(review): f <= m is assumed, not checked */
    1.39 ++  if(USE_UTF8) {
    1.40 ++      for (int i = f; i < m; i++)
    1.41 ++        if((cells_[i].character & 0xc0) == 0x80) r--;   /* continuation byte: not a new character */
    1.42 ++  }
    1.43 ++  return r;
    1.44 ++}
    1.45 ++
    1.46 + void
    1.47 + Line::resize(size_type l)
    1.48 + {
    1.49 +@@ -236,6 +260,23 @@
    1.50 +   return *this;
    1.51 + }
    1.52 + 
    1.53 ++unsigned int
    1.54 ++Area::utf_width()   /* width_ unless USE_UTF8; then the largest per-row character count */
    1.55 ++{
    1.56 ++  size_type r = width_;
    1.57 ++  if(USE_UTF8) { r = 0;
    1.58 ++    for (size_type yy = 0; yy < height_; yy++) {
    1.59 ++      size_type r1 = 0;   /* characters counted so far in row yy */
    1.60 ++      for (int i = width_ - 1; i >= 0; i--) {   /* scan the row from right to left */
    1.61 ++        if(!r1 && isspace(cells_[yy][i].character)) continue;   /* skip trailing whitespace; NOTE(review): character may be a signed char, confirm isspace() is safe for bytes >= 0x80 */
    1.62 ++        if((cells_[yy][i].character & 0xc0) != 0x80) r1++;   /* count everything except continuation bytes */
    1.63 ++      }
    1.64 ++      if(r < r1) r = r1;   /* keep the maximum over all rows */
    1.65 ++    }
    1.66 ++  }
    1.67 ++  return r;
    1.68 ++}
    1.69 ++
    1.70 + void
    1.71 + Area::resize(size_type w, size_type h)
    1.72 + {
    1.73 +@@ -439,7 +480,7 @@
    1.74 +       char c = p->character;
    1.75 +       char a = p->attribute;
    1.76 + 
    1.77 +-      if (c == (char) LATIN1_nbsp) c = ' ';
    1.78 ++      if (c == (char) LATIN1_nbsp && !USE_UTF8) c = ' ';
    1.79 + 
    1.80 +       if (a == Cell::NONE) {
    1.81 +         os << c;
    1.82 +Nur in html2text-1.3.2a-patched/: Area.C.orig.
    1.83 +diff -r -u -bB html2text-1.3.2a/Area.h html2text-1.3.2a-patched/Area.h
    1.84 +--- html2text-1.3.2a/Area.h	2003-11-23 12:05:29.000000000 +0100
    1.85 ++++ html2text-1.3.2a-patched/Area.h	2005-05-13 22:19:59.863137536 +0200
    1.86 +@@ -81,6 +81,8 @@
    1.87 +   Cell       &operator[](size_type x)       { return cells_[x]; }
    1.88 +   const Cell *cells() const { return cells_; }
    1.89 + 
    1.90 ++  unsigned int utf_length(size_type f, size_type t) const;
    1.91 ++
    1.92 +   void resize(size_type l);
    1.93 +   void enlarge(size_type l) { if (l > length_) resize(l); }
    1.94 + 
    1.95 +@@ -134,6 +136,8 @@
    1.96 +   Cell       *operator[](size_type y)       { return cells_[y]; }
    1.97 +   const Area &operator>>=(size_type rs);
    1.98 + 
    1.99 ++  unsigned int utf_width();
   1.100 ++
   1.101 +   void resize(size_type w, size_type h);
   1.102 +   void enlarge(size_type w, size_type h);
   1.103 + 
   1.104 +Nur in html2text-1.3.2a-patched/: Area.h.orig.
   1.105 +diff -r -u -bB html2text-1.3.2a/format.C html2text-1.3.2a-patched/format.C
   1.106 +--- html2text-1.3.2a/format.C	2003-11-23 12:05:29.000000000 +0100
   1.107 ++++ html2text-1.3.2a-patched/format.C	2005-05-13 22:19:59.865137232 +0200
   1.108 +@@ -1210,6 +1210,7 @@
   1.109 +     }
   1.110 + 
   1.111 +     Line::size_type to = from + 1;
   1.112 ++    int to_from;
   1.113 + 
   1.114 +     Line::size_type lbp = (Line::size_type) -1; // "Last break position".
   1.115 + 
   1.116 +@@ -1238,18 +1239,20 @@
   1.117 +         to++;
   1.118 +       }
   1.119 + 
   1.120 +-      if (to - from > w && lbp != (Area::size_type) -1) { to = lbp; break; }
   1.121 ++      if (line.utf_length(from,to) > w && lbp != (Area::size_type) -1) 
   1.122 ++                    { to = lbp; break; }
   1.123 +     }
   1.124 + 
   1.125 ++    to_from = line.utf_length(from,to);
   1.126 +     /*
   1.127 +      * Copy the "from...to" range from the "line" to the bottom of the "res"
   1.128 +      * Area.
   1.129 +      */
   1.130 +     Area::size_type x = 0;
   1.131 +     Area::size_type len = to - from;
   1.132 +-    if (halign == Area::LEFT || len >= w) { ;                   } else
   1.133 +-    if (halign == Area::CENTER)           { x += (w - len) / 2; } else
   1.134 +-    if (halign == Area::RIGHT)            { x += w - len;       }
   1.135 ++    if (halign == Area::LEFT || to_from >= w) { ;                   } else
   1.136 ++    if (halign == Area::CENTER)           { x += (w - to_from) / 2; } else
   1.137 ++    if (halign == Area::RIGHT)            { x += w - to_from;       }
   1.138 +     res->insert(line.cells() + from, len, x, res->height());
   1.139 + 
   1.140 +     /*
   1.141 +Nur in html2text-1.3.2a-patched/: format.C.orig.
   1.142 +diff -r -u -bB html2text-1.3.2a/html2text.C html2text-1.3.2a-patched/html2text.C
   1.143 +--- html2text-1.3.2a/html2text.C	2003-11-23 12:05:29.000000000 +0100
   1.144 ++++ html2text-1.3.2a-patched/html2text.C	2005-05-13 22:19:59.868136776 +0200
   1.145 +@@ -148,9 +148,10 @@
   1.146 +   -o <file>      Redirect output into <file>\n\
   1.147 +   -nobs          Do not use backspaces for boldface and underlining\n\
   1.148 +   -ascii         Use plain ASCII for output instead of ISO-8859-1\n\
   1.149 ++  -utf8          Assume both terminal and input stream are in UTF-8 mode\n\
   1.150 + ";
   1.151 + 
   1.152 +-int use_iso8859 = 1;
   1.153 ++int use_encoding = ISO8859;
   1.154 + 
   1.155 + int
   1.156 + main(int argc, char **argv)
   1.157 +@@ -199,7 +200,8 @@
   1.158 +     if (!strcmp(arg, "-width"        )) { width = atoi(argv[++i]);       } else
   1.159 +     if (!strcmp(arg, "-o"            )) { output_file_name = argv[++i];  } else
   1.160 +     if (!strcmp(arg, "-nobs"         )) { use_backspaces = false;        } else
   1.161 +-    if (!strcmp(arg, "-ascii"        )) { use_iso8859 = false;           } else
   1.162 ++    if (!strcmp(arg, "-ascii"        )) { use_encoding = ASCII;          } else
   1.163 ++    if (!strcmp(arg, "-utf8"         )) { use_encoding = UTF8;           } else
   1.164 +     {
   1.165 +       std::cerr
   1.166 + 	<< "Unrecognized command line option \""
   1.167 +Nur in html2text-1.3.2a-patched/: html2text.C.orig.
   1.168 +diff -r -u -bB html2text-1.3.2a/html.h html2text-1.3.2a-patched/html.h
   1.169 +--- html2text-1.3.2a/html.h	2001-10-04 22:03:54.000000000 +0200
   1.170 ++++ html2text-1.3.2a-patched/html.h	2005-05-13 22:19:59.866137080 +0200
   1.171 +@@ -61,6 +61,11 @@
   1.172 + 
   1.173 + /* ------------------------------------------------------------------------- */
   1.174 + 
   1.175 ++enum {ASCII, ISO8859, UTF8};   /* values stored in the global int use_encoding */
   1.176 ++#define USE_ISO8859 (use_encoding == ISO8859)   /* true when use_encoding is ISO8859 */
   1.177 ++#define USE_ASCII (use_encoding == ASCII)   /* true when use_encoding is ASCII */
   1.178 ++#define USE_UTF8 (use_encoding == UTF8)   /* true when use_encoding is UTF8 */
   1.179 ++
   1.180 + #define LATIN1_nbsp   160
   1.181 + #define LATIN1_iexcl  161
   1.182 + #define LATIN1_cent   162
   1.183 +diff -r -u -bB html2text-1.3.2a/sgml.C html2text-1.3.2a-patched/sgml.C
   1.184 +--- html2text-1.3.2a/sgml.C	2003-11-23 12:09:11.000000000 +0100
   1.185 ++++ html2text-1.3.2a-patched/sgml.C	2005-05-13 22:19:59.870136472 +0200
   1.186 +@@ -62,261 +62,280 @@
   1.187 +   char name[8];
   1.188 +   int  iso8859code;
   1.189 +   char *asciistr;
   1.190 ++  unsigned long unicode;
   1.191 + } entities[] = {
   1.192 +-  { "AElig",   LATIN1_AElig,  "AE"         },
   1.193 +-  { "AMP",     0,             "&"          },
   1.194 +-  { "Aacute",  LATIN1_Aacute, "A'"         },
   1.195 +-  { "Acirc",   LATIN1_Acirc,  "A^"         },
   1.196 +-  { "Agrave",  LATIN1_Agrave, "A`"         },
   1.197 +-  { "Alpha",   0,             "A"          },
   1.198 +-  { "Aring",   LATIN1_Aring,  "AA"         },
   1.199 +-  { "Atilde",  LATIN1_Atilde, "A~"         },
   1.200 +-  { "Auml",    LATIN1_Auml,   "A\""        },
   1.201 +-  { "Beta",    0,             "B"          },
   1.202 +-  { "Ccedil",  LATIN1_Ccedil, "C,"         },
   1.203 +-  { "Chi",     0,             "H"          },
   1.204 +-  { "Dagger",  0,             "++"         },
   1.205 +-  { "Delta",   0,             "D"          },
   1.206 +-  { "ETH",     LATIN1_ETH,    "D-"         },
   1.207 +-  { "Eacute",  LATIN1_Eacute, "E'"         },
   1.208 +-  { "Ecirc",   LATIN1_Ecirc,  "E^"         },
   1.209 +-  { "Egrave",  LATIN1_Egrave, "E`"         },
   1.210 +-  { "Epsilon", 0,             "E"          },
   1.211 +-  { "Eta",     0,             "E"          },
   1.212 +-  { "Euml",    LATIN1_Euml,   "E\""        },
   1.213 +-  { "GT",      0,             ">"          },
   1.214 +-  { "Gamma",   0,             "G"          },
   1.215 +-  { "Iacute",  LATIN1_Iacute, "I'"         },
   1.216 +-  { "Icirc",   LATIN1_Icirc,  "I^"         },
   1.217 +-  { "Igrave",  LATIN1_Igrave, "I`"         },
   1.218 +-  { "Iota",    0,             "I"          },
   1.219 +-  { "Iuml",    LATIN1_Iuml,   "I\""        },
   1.220 +-  { "Kappa",   0,             "K"          },
   1.221 +-  { "LT",      0,             "<"          },
   1.222 +-  { "Lambda",  0,             "L"          },
   1.223 +-  { "Mu",      0,             "M"          },
   1.224 +-  { "Ntilde",  LATIN1_Ntilde, "N~"         },
   1.225 +-  { "Nu",      0,             "N"          },
   1.226 +-  { "OElig",   0,             "OE"         },
   1.227 +-  { "Oacute",  LATIN1_Oacute, "O'"         },
   1.228 +-  { "Ocirc",   LATIN1_Ocirc,  "O^"         },
   1.229 +-  { "Ograve",  LATIN1_Ograve, "O`"         },
   1.230 +-  { "Omega",   0,             "O"          },
   1.231 +-  { "Omicron", 0,             "O"          },
   1.232 +-  { "Oslash",  LATIN1_Oslash, "O/"         },
   1.233 +-  { "Otilde",  LATIN1_Otilde, "O~"         },
   1.234 +-  { "Ouml",    LATIN1_Ouml,   "O\""        },
   1.235 +-  { "Phi",     0,             "F"          },
   1.236 +-  { "Pi",      0,             "P"          },
   1.237 +-  { "Prime",   0,             "''"         },
   1.238 +-  { "Psi",     0,             "PS"         },
   1.239 +-  { "QUOT",    0,             "\""         },
   1.240 +-  { "Rho",     0,             "R"          },
   1.241 +-  { "Scaron",  0,             "S"          },
   1.242 +-  { "Sigma",   0,             "S"          },
   1.243 +-  { "THORN",   LATIN1_THORN,  "TH"         },
   1.244 +-  { "Tau",     0,             "T"          },
   1.245 +-  { "Theta",   0,             "TH"         },
   1.246 +-  { "Uacute",  LATIN1_Uacute, "U'"         },
   1.247 +-  { "Ucirc",   LATIN1_Ucirc,  "U^"         },
   1.248 +-  { "Ugrave",  LATIN1_Ugrave, "U`"         },
   1.249 +-  { "Upsilon", 0,             "U"          },
   1.250 +-  { "Uuml",    LATIN1_Uuml,   "U\""        },
   1.251 +-  { "Xi",      0,             "X"          },
   1.252 +-  { "Yacute",  LATIN1_Yacute, "Y'"         },
   1.253 +-  { "Yuml",    0,             "Y\""        },
   1.254 +-  { "Zeta",    0,             "Z"          },
   1.255 +-  { "aacute",  LATIN1_aacute, "a'"         },
   1.256 +-  { "acirc",   LATIN1_acirc,  "a^"         },
   1.257 +-  { "acute",   LATIN1_acute,  "'"          },
   1.258 +-  { "aelig",   LATIN1_aelig,  "ae"         },
   1.259 +-  { "agrave",  LATIN1_agrave, "a`"         },
   1.260 ++  { "AElig",   LATIN1_AElig,  "AE",  0x00c6},
   1.261 ++  { "AMP",     0,             "&",   0x0026},
   1.262 ++  { "Aacute",  LATIN1_Aacute, "A'",  0x00c1},
   1.263 ++  { "Acirc",   LATIN1_Acirc,  "A^",  0x00c2},
   1.264 ++  { "Agrave",  LATIN1_Agrave, "A`",  0x00c0},
   1.265 ++  { "Alpha",   0,             "A",   0x0391},
   1.266 ++  { "Aring",   LATIN1_Aring,  "AA",  0x00c5},
   1.267 ++  { "Atilde",  LATIN1_Atilde, "A~",  0x00c3},
   1.268 ++  { "Auml",    LATIN1_Auml,   "A\"", 0x00c4},
   1.269 ++  { "Beta",    0,             "B",   0x0392},
   1.270 ++  { "Ccedil",  LATIN1_Ccedil, "C,",  0x00c7},
   1.271 ++  { "Chi",     0,             "H",   0x03a7},
   1.272 ++  { "Dagger",  0,             "++",  0x2021}, // U+2021 DOUBLE DAGGER
   1.273 ++  { "Delta",   0,             "D",   0x0394},
   1.274 ++  { "ETH",     LATIN1_ETH,    "D-",  0x00d0},
   1.275 ++  { "Eacute",  LATIN1_Eacute, "E'",  0x00c9},
   1.276 ++  { "Ecirc",   LATIN1_Ecirc,  "E^",  0x00ca},
   1.277 ++  { "Egrave",  LATIN1_Egrave, "E`",  0x00c8},
   1.278 ++  { "Epsilon", 0,             "E",   0x0395},
   1.279 ++  { "Eta",     0,             "E",   0x0397},
   1.280 ++  { "Euml",    LATIN1_Euml,   "E\"", 0x00cb},
   1.281 ++  { "GT",      0,             ">",   0x003e},
   1.282 ++  { "Gamma",   0,             "G",   0x0393},
   1.283 ++  { "Iacute",  LATIN1_Iacute, "I'",  0x00cd},
   1.284 ++  { "Icirc",   LATIN1_Icirc,  "I^",  0x00ce},
   1.285 ++  { "Igrave",  LATIN1_Igrave, "I`",  0x00cc},
   1.286 ++  { "Iota",    0,             "I",   0x0399},
   1.287 ++  { "Iuml",    LATIN1_Iuml,   "I\"", 0x00cf},
   1.288 ++  { "Kappa",   0,             "K",   0x039a},
   1.289 ++  { "LT",      0,             "<",   0x003c},
   1.290 ++  { "Lambda",  0,             "L",   0x039b},
   1.291 ++  { "Mu",      0,             "M",   0x039c},
   1.292 ++  { "Ntilde",  LATIN1_Ntilde, "N~",  0x00d1},
   1.293 ++  { "Nu",      0,             "N",   0x039d},
   1.294 ++  { "OElig",   0,             "OE",  0x0152},
   1.295 ++  { "Oacute",  LATIN1_Oacute, "O'",  0x00d3},
   1.296 ++  { "Ocirc",   LATIN1_Ocirc,  "O^",  0x00d4},
   1.297 ++  { "Ograve",  LATIN1_Ograve, "O`",  0x00d2},
   1.298 ++  { "Omega",   0,             "O",   0x03a9},
   1.299 ++  { "Omicron", 0,             "O",   0x039f},
   1.300 ++  { "Oslash",  LATIN1_Oslash, "O/",  0x00d8},
   1.301 ++  { "Otilde",  LATIN1_Otilde, "O~",  0x00d5},
   1.302 ++  { "Ouml",    LATIN1_Ouml,   "O\"", 0x00d6},
   1.303 ++  { "Phi",     0,             "F",   0x03a6},
   1.304 ++  { "Pi",      0,             "P",   0x03a0},
   1.305 ++  { "Prime",   0,             "''",        },
   1.306 ++  { "Psi",     0,             "PS",  0x03a8},
   1.307 ++  { "QUOT",    0,             "\"",        },
   1.308 ++  { "Rho",     0,             "R",   0x03a1},
   1.309 ++  { "Scaron",  0,             "S",   0x0160}, // U+0160 LATIN CAPITAL LETTER S WITH CARON
   1.310 ++  { "Sigma",   0,             "S",   0x03a3},
   1.311 ++  { "THORN",   LATIN1_THORN,  "TH",  0x00de},
   1.312 ++  { "Tau",     0,             "T",   0x03a4},
   1.313 ++  { "Theta",   0,             "TH",  0x0398},
   1.314 ++  { "Uacute",  LATIN1_Uacute, "U'",  0x00da},
   1.315 ++  { "Ucirc",   LATIN1_Ucirc,  "U^",  0x00db},
   1.316 ++  { "Ugrave",  LATIN1_Ugrave, "U`",  0x00d9},
   1.317 ++  { "Upsilon", 0,             "U",   0x03a5},
   1.318 ++  { "Uuml",    LATIN1_Uuml,   "U\"", 0x00dc},
   1.319 ++  { "Xi",      0,             "X",   0x039e},
   1.320 ++  { "Yacute",  LATIN1_Yacute, "Y'",  0x00dd},
   1.321 ++  { "Yuml",    0,             "Y\"", 0x0178},
   1.322 ++  { "Zeta",    0,             "Z",   0x0396},
   1.323 ++  { "aacute",  LATIN1_aacute, "a'",  0x00e1},
   1.324 ++  { "acirc",   LATIN1_acirc,  "a^",  0x00e2},
   1.325 ++  { "acute",   LATIN1_acute,  "'",   0x00b4},
   1.326 ++  { "aelig",   LATIN1_aelig,  "ae",  0x00e6},
   1.327 ++  { "agrave",  LATIN1_agrave, "a`",  0x00e0},
   1.328 +   { "alefsym", 0,             "Aleph"      },
   1.329 +-  { "alpha",   0,             "a"          },
   1.330 ++  { "alpha",   0,             "a",   0x03b1},
   1.331 +   { "amp",     0,             "&"          },
   1.332 +   { "and",     0,             "AND"        },
   1.333 +   { "ang",     0,             "-V"         },
   1.334 +   { "apos",    0,             "'"          },
   1.335 +-  { "aring",   LATIN1_aring,  "aa"         },
   1.336 +-  { "asymp",   0,             "~="         },
   1.337 +-  { "atilde",  LATIN1_atilde, "a~"         },
   1.338 +-  { "auml",    LATIN1_auml,   "a\""        },
   1.339 ++  { "aring",   LATIN1_aring,  "aa",  0x00e5},
   1.340 ++  { "asymp",   0,             "~=",  0x2248},
   1.341 ++  { "atilde",  LATIN1_atilde, "a~",  0x00e3},
   1.342 ++  { "auml",    LATIN1_auml,   "a\"", 0x00e4}, // U+00E4 LATIN SMALL LETTER A WITH DIAERESIS
   1.343 +   { "bdquo",   0,             "\""         },
   1.344 +-  { "beta",    0,             "b"          },
   1.345 +-  { "brvbar",  LATIN1_brvbar, "|"          },
   1.346 +-  { "bull",    0,             " o "        },
   1.347 ++  { "beta",    0,             "b",   0x03b2},
   1.348 ++  { "brvbar",  LATIN1_brvbar, "|",   0x00a6},
   1.349 ++  { "bull",    0,             " o ", 0x2022},
   1.350 +   { "cap",     0,             "(U"         },
   1.351 +-  { "ccedil",  LATIN1_ccedil, "c,"         },
   1.352 +-  { "cedil",   LATIN1_cedil,  ","          },
   1.353 +-  { "cent",    LATIN1_cent,   "-c-"        },
   1.354 +-  { "chi",     0,             "h"          },
   1.355 +-  { "circ",    0,             "^"          },
   1.356 ++  { "ccedil",  LATIN1_ccedil, "c,",  0x00e7},
   1.357 ++  { "cedil",   LATIN1_cedil,  ",",   0x00b8},
   1.358 ++  { "cent",    LATIN1_cent,   "-c-", 0x00a2},
   1.359 ++  { "chi",     0,             "h",   0x03c7},
   1.360 ++  { "circ",    0,             "^",   0x02c6}, // U+02C6 MODIFIER LETTER CIRCUMFLEX ACCENT
   1.361 + //  { "clubs",   0,             "[clubs]"    },
   1.362 +   { "cong",    0,             "?="         },
   1.363 +-  { "copy",    LATIN1_copy,   "(c)"        },
   1.364 ++  { "copy",    LATIN1_copy,   "(c)", 0x00a9},
   1.365 +   { "crarr",   0,             "<-'"        },
   1.366 +   { "cup",     0,             ")U"         },
   1.367 +-  { "curren",  LATIN1_curren, "CUR"        },
   1.368 ++  { "curren",  LATIN1_curren, "CUR", 0x00a4},
   1.369 +   { "dArr",    0,             "vv"         },
   1.370 +-  { "dagger",  0,             "+"          },
   1.371 ++  { "dagger",  0,             "+",   0x2020},
   1.372 +   { "darr",    0,             "v"          },
   1.373 +-  { "deg",     LATIN1_deg,    "DEG"        },
   1.374 +-  { "delta",   0,             "d"          },
   1.375 ++  { "deg",     LATIN1_deg,    "DEG", 0x00b0},
   1.376 ++  { "delta",   0,             "d",   0x03b4},
   1.377 + //  { "diams",   0,             "[diamonds]" },
   1.378 +-  { "divide",  LATIN1_divide, "/"          },
   1.379 +-  { "eacute",  LATIN1_eacute, "e'"         },
   1.380 +-  { "ecirc",   LATIN1_ecirc,  "e^"         },
   1.381 +-  { "egrave",  LATIN1_egrave, "e`"         },
   1.382 ++  { "divide",  LATIN1_divide, "/",   0x00f7},
   1.383 ++  { "eacute",  LATIN1_eacute, "e'",  0x00e9},
   1.384 ++  { "ecirc",   LATIN1_ecirc,  "e^",  0x00ea},
   1.385 ++  { "egrave",  LATIN1_egrave, "e`",  0x00e8},
   1.386 +   { "empty",   0,             "{}"         },
   1.387 +-  { "epsilon", 0,             "e"          },
   1.388 +-  { "equiv",   0,             "=="         },
   1.389 +-  { "eta",     0,             "e"          },
   1.390 +-  { "eth",     LATIN1_eth,    "d-"         },
   1.391 +-  { "euml",    LATIN1_euml,   "e\""        },
   1.392 +-  { "euro",    0,             "EUR"        },
   1.393 ++  { "epsilon", 0,             "e",   0x03b5},
   1.394 ++  { "equiv",   0,             "==",  0x2261},
   1.395 ++  { "eta",     0,             "e",   0x03b7},
   1.396 ++  { "eth",     LATIN1_eth,    "d-",  0x00f0},
   1.397 ++  { "euml",    LATIN1_euml,   "e\"", 0x00eb},
   1.398 ++  { "euro",    0,             "EUR", 0x20ac},
   1.399 +   { "exist",   0,             "TE"         },
   1.400 +   { "fnof",    0,             "f"          },
   1.401 +   { "forall",  0,             "FA"         },
   1.402 +-  { "frac12",  LATIN1_frac12, " 1/2"       },
   1.403 +-  { "frac14",  LATIN1_frac14, " 1/4"       },
   1.404 +-  { "frac34",  LATIN1_frac34, " 3/4"       },
   1.405 ++  { "frac12",  LATIN1_frac12, " 1/2",0x00bd},
   1.406 ++  { "frac14",  LATIN1_frac14, " 1/4",0x00bc},
   1.407 ++  { "frac34",  LATIN1_frac34, " 3/4",0x00be},
   1.408 +   { "frasl",   0,             "/"          },
   1.409 +-  { "gamma",   0,             "g"          },
   1.410 +-  { "ge",      0,             ">="         },
   1.411 +-  { "gt",      0,             ">"          },
   1.412 ++  { "gamma",   0,             "g",   0x03b3},
   1.413 ++  { "ge",      0,             ">=",  0x2265},
   1.414 ++  { "gt",      0,             ">",   0x003e},
   1.415 +   { "hArr",    0,             "<=>"        },
   1.416 +   { "harr",    0,             "<->"        },
   1.417 + //  { "hearts",  0,             "[hearts]"   },
   1.418 +-  { "hellip",  0,             "..."        },
   1.419 +-  { "iacute",  LATIN1_iacute, "i'"         },
   1.420 +-  { "icirc",   LATIN1_icirc,  "i^"         },
   1.421 +-  { "iexcl",   LATIN1_iexcl,  "!"          },
   1.422 +-  { "igrave",  LATIN1_igrave, "i`"         },
   1.423 ++  { "hellip",  0,             "...", 0x2026},
   1.424 ++  { "iacute",  LATIN1_iacute, "i'",  0x00ed},
   1.425 ++  { "icirc",   LATIN1_icirc,  "i^",  0x00ee},
   1.426 ++  { "iexcl",   LATIN1_iexcl,  "!",   0x00a1},
   1.427 ++  { "igrave",  LATIN1_igrave, "i`",  0x00ec},
   1.428 +   { "image",   0,             "Im"         },
   1.429 +-  { "infin",   0,             "oo"         },
   1.430 +-  { "int",     0,             "INT"        },
   1.431 +-  { "iota",    0,             "i"          },
   1.432 +-  { "iquest",  LATIN1_iquest, "?"          },
   1.433 ++  { "infin",   0,             "oo",  0x221e},
   1.434 ++  { "int",     0,             "INT", 0x222b},
   1.435 ++  { "iota",    0,             "i",   0x03b9},
   1.436 ++  { "iquest",  LATIN1_iquest, "?",   0x00bf},
   1.437 +   { "isin",    0,             "(-"         },
   1.438 +-  { "iuml",    LATIN1_iuml,   "i\""        },
   1.439 +-  { "kappa",   0,             "k"          },
   1.440 ++  { "iuml",    LATIN1_iuml,   "i\"", 0x00ef},
   1.441 ++  { "kappa",   0,             "k",   0x03ba},
   1.442 +   { "lArr",    0,             "<="         },
   1.443 +-  { "lambda",  0,             "l"          },
   1.444 ++  { "lambda",  0,             "l",   0x03bb},
   1.445 +   { "lang",    0,             "</"         },
   1.446 +   { "laquo",   LATIN1_laquo,  "<<"         },
   1.447 +-  { "larr",    0,             "<-"         },
   1.448 ++  { "larr",    0,             "<-",  0x2190},
   1.449 + //  { "lceil",   0,             "<|"         },
   1.450 +   { "ldquo",   0,             "\""         },
   1.451 +-  { "le",      0,             "<="         },
   1.452 ++  { "le",      0,             "<=",  0x2264},
   1.453 + //  { "lfloor",  0,             "|<"         },
   1.454 +   { "lowast",  0,             "*"          },
   1.455 +   { "loz",     0,             "<>"         },
   1.456 +   { "lsaquo",  0,             "<"          },
   1.457 +   { "lsquo",   0,             "`"          },
   1.458 +-  { "lt",      0,             "<"          },
   1.459 +-  { "macr",    LATIN1_macr,   "-"          },
   1.460 ++  { "lt",      0,             "<",   0x003c},
   1.461 ++  { "macr",    LATIN1_macr,   "-",   0x00af},
   1.462 +   { "mdash",   0,             "--"         },
   1.463 +-  { "micro",   LATIN1_micro,  "my"         },
   1.464 +-  { "middot",  LATIN1_middot, "."          },
   1.465 +-  { "minus",   0,             "-"          },
   1.466 +-  { "mu",      0,             "m"          },
   1.467 ++  { "micro",   LATIN1_micro,  "my",  0x00b5},
   1.468 ++  { "middot",  LATIN1_middot, ".",   0x00b7},
   1.469 ++  { "minus",   0,             "-",   0x2212},
   1.470 ++  { "mu",      0,             "m",   0x03bc},
   1.471 +   { "nabla",   0,             "Nabla"      },
   1.472 +-  { "nbsp",    LATIN1_nbsp,   " "          },
   1.473 ++  { "nbsp",    LATIN1_nbsp,   " ",   0x00a0},
   1.474 +   { "ndash",   0,             "-"          },
   1.475 +-  { "ne",      0,             "!="         },
   1.476 ++  { "ne",      0,             "!=",  0x2260},
   1.477 +   { "ni",      0,             "-)"         },
   1.478 +   { "not",     LATIN1_not,    "NOT"        },
   1.479 +   { "notin",   0,             "!(-"        },
   1.480 +   { "nsub",    0,             "!(C"        },
   1.481 +-  { "ntilde",  LATIN1_ntilde, "n~"         },
   1.482 +-  { "nu",      0,             "n"          },
   1.483 +-  { "oacute",  LATIN1_oacute, "o'"         },
   1.484 +-  { "ocirc",   LATIN1_ocirc,  "o^"         },
   1.485 ++  { "ntilde",  LATIN1_ntilde, "n~",  0x00f1},
   1.486 ++  { "nu",      0,             "n",   0x03bd},
   1.487 ++  { "oacute",  LATIN1_oacute, "o'",  0x00f3},
   1.488 ++  { "ocirc",   LATIN1_ocirc,  "o^",  0x00f4},
   1.489 +   { "oelig",   0,             "oe"         },
   1.490 +-  { "ograve",  LATIN1_ograve, "o`"         },
   1.491 ++  { "ograve",  LATIN1_ograve, "o`",  0x00f2},
   1.492 +   { "oline",   LATIN1_macr,   "-"          },
   1.493 +-  { "omega",   0,             "o"          },
   1.494 +-  { "omicron", 0,             "o"          },
   1.495 ++  { "omega",   0,             "o",   0x03c9},
   1.496 ++  { "omicron", 0,             "o",   0x03bf},
   1.497 +   { "oplus",   0,             "(+)"        },
   1.498 +   { "or",      0,             "OR"         },
   1.499 +-  { "ordf",    LATIN1_ordf,   "-a"         },
   1.500 +-  { "ordm",    LATIN1_ordm,   "-o"         },
   1.501 +-  { "oslash",  LATIN1_oslash, "o/"         },
   1.502 +-  { "otilde",  LATIN1_otilde, "o~"         },
   1.503 ++  { "ordf",    LATIN1_ordf,   "-a",  0x00aa},
   1.504 ++  { "ordm",    LATIN1_ordm,   "-o",  0x00ba},
   1.505 ++  { "oslash",  LATIN1_oslash, "o/",  0x00f8},
   1.506 ++  { "otilde",  LATIN1_otilde, "o~",  0x00f5},
   1.507 +   { "otimes",  0,             "(x)"        },
   1.508 +-  { "ouml",    LATIN1_ouml,   "o\""        },
   1.509 +-  { "para",    LATIN1_para,   "P:"         },
   1.510 +-  { "part",    0,             "PART"       },
   1.511 +-  { "permil",  0,             " 0/00"      },
   1.512 ++  { "ouml",    LATIN1_ouml,   "o\"", 0x00f6},
   1.513 ++  { "para",    LATIN1_para,   "P:",  0x00b6},
   1.514 ++  { "part",    0,             "PART",0x2202},
   1.515 ++  { "permil",  0,             " 0/00",0x2030},
   1.516 +   { "perp",    0,             "-T"         },
   1.517 +-  { "phi",     0,             "f"          },
   1.518 +-  { "pi",      0,             "p"          },
   1.519 ++  { "phi",     0,             "f",   0x03c6},
   1.520 ++  { "pi",      0,             "p",   0x03c0},
   1.521 +   { "piv",     0,             "Pi"         },
   1.522 +-  { "plusmn",  LATIN1_plusmn, "+/-"        },
   1.523 +-  { "pound",   LATIN1_pound,  "-L-"        },
   1.524 ++  { "plusmn",  LATIN1_plusmn, "+/-", 0x00b1},
   1.525 ++  { "pound",   LATIN1_pound,  "-L-", 0x00a3},
   1.526 +   { "prime",   0,             "'"          },
   1.527 +-  { "prod",    0,             "PROD"       },
   1.528 ++  { "prod",    0,             "PROD",0x220f},
   1.529 +   { "prop",    0,             "0("         },
   1.530 +-  { "psi",     0,             "ps"         },
   1.531 ++  { "psi",     0,             "ps",  0x03c8},
   1.532 +   { "quot",    0,             "\""         },
   1.533 +   { "rArr",    0,             "=>"         },
   1.534 +-  { "radic",   0,             "SQRT"       },
   1.535 ++  { "radic",   0,             "SQRT",0x221a},
   1.536 +   { "rang",    0,             "/>"         },
   1.537 +   { "raquo",   LATIN1_raquo,  ">>"         },
   1.538 +-  { "rarr",    0,             "->"         },
   1.539 ++  { "rarr",    0,             "->",  0x2192},
   1.540 + //  { "rceil",   0,             ">|"         },
   1.541 +   { "rdquo",   0,             "\""         },
   1.542 +   { "real",    0,             "Re"         },
   1.543 +-  { "reg",     LATIN1_reg,    "(R)"        },
   1.544 ++  { "reg",     LATIN1_reg,    "(R)", 0x00ae},
   1.545 + //  { "rfloor",  0,             "|>"         },
   1.546 +-  { "rho",     0,             "r"          },
   1.547 ++  { "rho",     0,             "r",   0x03c1},
   1.548 +   { "rsaquo",  0,             ">"          },
   1.549 +   { "rsquo",   0,             "'"          },
   1.550 +   { "sbquo",   0,             "'"          },
   1.551 +-  { "scaron",  0,             "s"          },
   1.552 ++  { "scaron",  0,             "s",   0x0161},
   1.553 +   { "sdot",    0,             "DOT"        },
   1.554 +-  { "sect",    LATIN1_sect,   "S:"         },
   1.555 ++  { "sect",    LATIN1_sect,   "S:",  0x00a7},
   1.556 +   { "shy",     LATIN1_shy,    ""           },
   1.557 +-  { "sigma",   0,             "s"          },
   1.558 +-  { "sigmaf",  0,             "s"          },
   1.559 ++  { "sigma",   0,             "s",   0x03c3},
   1.560 ++  { "sigmaf",  0,             "s",   0x03c2},
   1.561 +   { "sim",     0,             "~"          },
   1.562 + //  { "spades",  0,             "[spades]"   },
   1.563 +   { "sub",     0,             "(C"         },
   1.564 +   { "sube",    0,             "(_"         },
   1.565 +-  { "sum",     0,             "SUM"        },
   1.566 ++  { "sum",     0,             "SUM", 0x2211},
   1.567 +   { "sup",     0,             ")C"         },
   1.568 +-  { "sup1",    LATIN1_sup1,   "^1"         },
   1.569 +-  { "sup2",    LATIN1_sup2,   "^2"         },
   1.570 +-  { "sup3",    LATIN1_sup3,   "^3"         },
   1.571 ++  { "sup1",    LATIN1_sup1,   "^1",  0x00b9},
   1.572 ++  { "sup2",    LATIN1_sup2,   "^2",  0x00b2},
   1.573 ++  { "sup3",    LATIN1_sup3,   "^3",  0x00b3},
   1.574 +   { "supe",    0,             ")_"         },
   1.575 +-  { "szlig",   LATIN1_szlig,  "ss"         },
   1.576 +-  { "tau",     0,             "t"          },
   1.577 ++  { "szlig",   LATIN1_szlig,  "ss",  0x00df},
   1.578 ++  { "tau",     0,             "t",   0x03c4},
   1.579 +   { "there4",  0,             ".:"         },
   1.580 +-  { "theta",   0,             "th"         },
   1.581 +-  { "thorn",   LATIN1_thorn,  "th"         },
   1.582 +-  { "tilde",   0,             "~"          },
   1.583 +-  { "times",   LATIN1_times,  "x"          },
   1.584 +-  { "trade",   0,             "[TM]"       },
   1.585 ++  { "theta",   0,             "th",  0x03b8},
   1.586 ++  { "thorn",   LATIN1_thorn,  "th",  0x00fe},
   1.587 ++  { "tilde",   0,             "~",   0x02dc},
   1.588 ++  { "times",   LATIN1_times,  "x",   0x00d7},
   1.589 ++  { "trade",   0,             "[TM]",0x2122},
   1.590 +   { "uArr",    0,             "^^"         },
   1.591 +-  { "uacute",  LATIN1_uacute, "u'"         },
   1.592 ++  { "uacute",  LATIN1_uacute, "u'",  0x00fa},
   1.593 +   { "uarr",    0,             "^"          },
   1.594 +-  { "ucirc",   LATIN1_ucirc,  "u^"         },
   1.595 +-  { "ugrave",  LATIN1_ugrave, "u`"         },
   1.596 +-  { "uml",     LATIN1_uml,    "\""         },
   1.597 +-  { "upsilon", 0,             "u"          },
   1.598 +-  { "uuml",    LATIN1_uuml,   "u\""        },
   1.599 ++  { "ucirc",   LATIN1_ucirc,  "u^",  0x00fb},
   1.600 ++  { "ugrave",  LATIN1_ugrave, "u`",  0x00f9},
   1.601 ++  { "uml",     LATIN1_uml,    "\"",  0x00a8},
   1.602 ++  { "upsilon", 0,             "u",   0x03c5},
   1.603 ++  { "uuml",    LATIN1_uuml,   "u\"", 0x00fc},
   1.604 +   { "weierp",  0,             "P"          },
   1.605 +-  { "xi",      0,             "x"          },
   1.606 +-  { "yacute",  LATIN1_yacute, "y'"         },
   1.607 +-  { "yen",     LATIN1_yen,    "YEN"        },
   1.608 +-  { "yuml",    LATIN1_yuml,   "y\""        },
   1.609 +-  { "zeta",    0,             "z"          },
   1.610 ++  { "xi",      0,             "x",   0x03be},
   1.611 ++  { "yacute",  LATIN1_yacute, "y'",  0x00fd},
   1.612 ++  { "yen",     LATIN1_yen,    "YEN", 0x00a5},
   1.613 ++  { "yuml",    LATIN1_yuml,   "y\"", 0x00ff},
   1.614 ++  { "zeta",    0,             "z",   0x03b6},
   1.615 + };
   1.616 + 
   1.617 +-extern int use_iso8859;
   1.618 ++extern int use_encoding;
   1.619 + 
   1.620 + /* ------------------------------------------------------------------------- */
   1.621 + 
   1.622 ++char ubuf[4]; /* file-scope result buffer of mkutf(); overwritten on every call */
   1.623 ++/* mkutf(): store code point x in ubuf as a NUL-terminated UTF-8 string and return ubuf. */
   1.624 ++char *mkutf(unsigned long x)
   1.625 ++{
   1.626 ++  memset(ubuf, 0, 4); /* zero all 4 bytes so unused trailing bytes terminate the string */
   1.627 ++  if(x < 128) ubuf[0] = x; /* one byte: value stored unchanged (x == 0 gives an empty string) */
   1.628 ++  else if(x < 0x800) { /* two bytes: 110xxxxx 10xxxxxx */
   1.629 ++     ubuf[0] = (0xc0 | ((x >> 6) & 0x1f));
   1.630 ++     ubuf[1] = (0x80 | (x & 0x3f));
   1.631 ++  }
   1.632 ++  else { /* three bytes: 1110xxxx 10xxxxxx 10xxxxxx */
   1.633 ++     ubuf[0] = (0xe0 | ((x >> 12) & 0x0f)); /* NOTE(review): this branch is also taken for x >= 0x10000; bits above bit 15 are masked off, so such values come out as a different character */
   1.634 ++     ubuf[1] = (0x80 | ((x >> 6) & 0x3f));
   1.635 ++     ubuf[2] = (0x80 | (x & 0x3f));
   1.636 ++  }
   1.637 ++  return ubuf; /* points at the shared buffer: contents change on the next mkutf() call */
   1.638 ++}
   1.639 ++
   1.640 + void
   1.641 + replace_sgml_entities(string *s)
   1.642 + {
   1.643 +@@ -330,9 +349,9 @@
   1.644 +      */
   1.645 +     while (j < l && s->at(j) != '&') ++j;
   1.646 +     /*
   1.647 +-     * We could convert high-bit chars to "&#233;" here if use_iso8859
   1.648 +-     * is off, then let them be translated or not.  Is the purpose of
   1.649 +-     * !use_iso8859 to allow SGML entities to be seen, or to strongly
   1.650 ++     * We could convert high-bit chars to "&#233;" here if USE_ASCII
   1.651 ++     * is on, then let them be translated or not.  Is the purpose of
   1.652 ++     * USE_ASCII to allow SGML entities to be seen, or to strongly
   1.653 +      * filter against high-ASCII chars that might blow up a terminal
   1.654 +      * that doesn't speak ISO8859?  For the moment, "allow SGML entities
   1.655 +      * to be seen" -- no filtering here.
   1.656 +@@ -370,7 +389,11 @@
   1.657 +           if (!isdigit(c)) break;
   1.658 +           x = 10 * x + c - '0';
   1.659 +         }
   1.660 +-        if (use_iso8859 || (x < 128)) {
   1.661 ++        if (USE_UTF8) {
   1.662 ++          s->replace(beg, j - beg, mkutf(x));
   1.663 ++          j = beg + 1;
   1.664 ++        }
   1.665 ++        else if (USE_ISO8859 && (x < 256) || USE_ASCII && (x < 128)) {
   1.666 +         s->replace(beg, j - beg, 1, (char) x);
   1.667 +         j = beg + 1;
   1.668 +         } else {
   1.669 +@@ -408,13 +431,17 @@
   1.670 +         (int (*)(const void *, const void *)) strcmp
   1.671 +       );
   1.672 +       if (entity != NULL) {
   1.673 +-        if (use_iso8859 && entity->iso8859code) {
   1.674 ++        if (USE_ISO8859 && entity->iso8859code) {
   1.675 +           s->replace(beg, j - beg, 1, (char) entity->iso8859code);
   1.676 +           j = beg + 1;
   1.677 +-        } else if (entity->asciistr) {
   1.678 ++        } else if (USE_ASCII && entity->asciistr) {
   1.679 +           s->replace(beg, j - beg, entity->asciistr);
   1.680 +         j = beg + 1;
   1.681 +         } /* else don't replace it at all, we don't have a translation */
   1.682 ++        else if(USE_UTF8 && entity->unicode) {
   1.683 ++        s->replace(beg, j - beg, mkutf(entity->unicode));
   1.684 ++        j = beg + 1;
   1.685 ++        }
   1.686 +       }
   1.687 +     } else {
   1.688 +       ;                         /* EXTENSION: Allow literal '&' sometimes. */
   1.689 +diff -r -u -bB html2text-1.3.2a/table.C html2text-1.3.2a-patched/table.C
   1.690 +--- html2text-1.3.2a/table.C	2002-07-22 13:32:50.000000000 +0200
   1.691 ++++ html2text-1.3.2a-patched/table.C	2005-05-13 22:19:59.871136320 +0200
   1.692 +@@ -175,7 +175,7 @@
   1.693 +           - (*number_of_columns_return - 1) * (column_spacing + 0),
   1.694 +           Area::LEFT // Yields better results than "p->halign"!
   1.695 +         ));
   1.696 +-	p->width = tmp.get() ? tmp->width() : 0;
   1.697 ++	p->width = tmp.get() ? tmp->utf_width() : 0;
   1.698 +       }
   1.699 +       p->minimized = false;
   1.700 + 
   1.701 +@@ -308,7 +308,7 @@
   1.702 + 	left_of_column + old_column_width - 1,
   1.703 + 	Area::LEFT // Yields better results than "lc.halign"!
   1.704 +       ));
   1.705 +-      w = tmp->width();
   1.706 ++      w = tmp->utf_width();
   1.707 +       if (w >= left_of_column + old_column_width) lc.minimized = true;
   1.708 +     }
   1.709 +     if (w > left_of_column + new_column_width) {
author	Pascal Bellard <pascal.bellard@slitaz.org>
date	Wed Jan 22 11:18:51 2020 +0100 (2020-01-22)
parents
children