Skip to content

Instantly share code, notes, and snippets.

@dweinstein
Last active October 21, 2016 01:12
Show Gist options
  • Save dweinstein/3125bed0a478e2b0acfccfae91c90fd5 to your computer and use it in GitHub Desktop.
Save dweinstein/3125bed0a478e2b0acfccfae91c90fd5 to your computer and use it in GitHub Desktop.

Revisions

  1. dweinstein revised this gist Oct 11, 2016. 1 changed file with 2 additions and 4 deletions.
    6 changes: 2 additions & 4 deletions guess-encoding.js
    Original file line number Diff line number Diff line change
    @@ -28,13 +28,11 @@ const UTF_8_CONTINUE_MATCH = 0x80;
    module.exports.zipGuessEncoding = zipGuessEncoding;

    function zipGuessEncoding (buf, expectedEncoding) {
    let enc = ZIP_ENCODING_UNKNOWN;

    let enc = ZIP_ENCODING_ASCII;
    if (!buf) {
    return ZIP_ENCODING_ASCII;
    return enc;
    }

    enc = ZIP_ENCODING_ASCII;
    for (let i=0; i < buf.length; ++i) {
    if ((buf[i] > 31 && buf[i] < 128) || buf[i] == '\r' || buf[i] == '\n' || buf[i] == '\t') {
    continue;
  2. dweinstein revised this gist Oct 11, 2016. 1 changed file with 2 additions and 0 deletions.
    2 changes: 2 additions & 0 deletions guess.md
    Original file line number Diff line number Diff line change
    @@ -1,3 +1,5 @@
    taken from libzip's [`_zip_guess_encoding`](https://fossies.org/dox/libzip-1.1.3/zip__utf-8_8c.html#af395901a6dad7f34240d186211ea72b9).

    ```c
    zip_encoding_type_t
    117 _zip_guess_encoding(zip_string_t *str, zip_encoding_type_t expected_encoding)
  3. dweinstein revised this gist Oct 11, 2016. No changes.
  4. dweinstein revised this gist Oct 11, 2016. 1 changed file with 63 additions and 0 deletions.
    63 changes: 63 additions & 0 deletions guess.md
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,63 @@
    ```c
    zip_encoding_type_t
    117 _zip_guess_encoding(zip_string_t *str, zip_encoding_type_t expected_encoding)
    118 {
    119 zip_encoding_type_t enc;
    120 const zip_uint8_t *name;
    121 zip_uint32_t i, j, ulen;
    122
    123 if (str == NULL)
    124 return ZIP_ENCODING_ASCII;
    125
    126 name = str->raw;
    127
    128 if (str->encoding != ZIP_ENCODING_UNKNOWN)
    129 enc = str->encoding;
    130 else {
    131 enc = ZIP_ENCODING_ASCII;
    132 for (i=0; i<str->length; i++) {
    133 if ((name[i] > 31 && name[i] < 128) || name[i] == '\r' || name[i] == '\n' || name[i] == '\t')
    134 continue;
    135
    136 enc = ZIP_ENCODING_UTF8_GUESSED;
    137 if ((name[i] & UTF_8_LEN_2_MASK) == UTF_8_LEN_2_MATCH)
    138 ulen = 1;
    139 else if ((name[i] & UTF_8_LEN_3_MASK) == UTF_8_LEN_3_MATCH)
    140 ulen = 2;
    141 else if ((name[i] & UTF_8_LEN_4_MASK) == UTF_8_LEN_4_MATCH)
    142 ulen = 3;
    143 else {
    144 enc = ZIP_ENCODING_CP437;
    145 break;
    146 }
    147
    148 if (i + ulen >= str->length) {
    149 enc = ZIP_ENCODING_CP437;
    150 break;
    151 }
    152
    153 for (j=1; j<=ulen; j++) {
    154 if ((name[i+j] & UTF_8_CONTINUE_MASK) != UTF_8_CONTINUE_MATCH) {
    155 enc = ZIP_ENCODING_CP437;
    156 goto done;
    157 }
    158 }
    159 i += ulen;
    160 }
    161 }
    162
    163 done:
    164 str->encoding = enc;
    165
    166 if (expected_encoding != ZIP_ENCODING_UNKNOWN) {
    167 if (expected_encoding == ZIP_ENCODING_UTF8_KNOWN && enc == ZIP_ENCODING_UTF8_GUESSED)
    168 str->encoding = enc = ZIP_ENCODING_UTF8_KNOWN;
    169
    170 if (expected_encoding != enc && enc != ZIP_ENCODING_ASCII)
    171 return ZIP_ENCODING_ERROR;
    172 }
    173
    174 return enc;
    175 }
    176
    ```
  5. dweinstein created this gist Oct 11, 2016.
    84 changes: 84 additions & 0 deletions guess-encoding.js
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,84 @@
    'use strict';

    // Encoding classification codes used and returned by zipGuessEncoding.
    // The names mirror the zip_encoding_type_t values in the libzip routine
    // this module was ported from.
    const ZIP_ENCODING_UNKNOWN = 0;      // no expectation supplied by the caller
    const ZIP_ENCODING_ASCII = 1;        // only bytes accepted by the ASCII check were seen
    const ZIP_ENCODING_UTF8_KNOWN = 2;   // guessed UTF-8 and the caller expected UTF-8
    const ZIP_ENCODING_UTF8_GUESSED = 3; // every non-ASCII byte belonged to a well-formed UTF-8 sequence
    const ZIP_ENCODING_CP437 = 4;        // fallback when the bytes are not well-formed UTF-8
    const ZIP_ENCODING_ERROR = 5;        // detected encoding conflicts with the expected one

    // Public name -> code map so callers can pass and compare encodings
    // without hard-coding the numeric values.
    module.exports.zipEncodings = {
    unknown: ZIP_ENCODING_UNKNOWN,
    ascii: ZIP_ENCODING_ASCII,
    utf8known: ZIP_ENCODING_UTF8_KNOWN,
    utf8guessed: ZIP_ENCODING_UTF8_GUESSED,
    cp437: ZIP_ENCODING_CP437,
    error: ZIP_ENCODING_ERROR
    }

    // UTF-8 byte classification patterns. A byte `b` belongs to a class when
    // (b & MASK) === MATCH. Written in binary so the tested bits are visible.

    // 110xxxxx: lead byte followed by one continuation byte.
    const UTF_8_LEN_2_MASK = 0b11100000;
    const UTF_8_LEN_2_MATCH = 0b11000000;

    // 1110xxxx: lead byte followed by two continuation bytes.
    const UTF_8_LEN_3_MASK = 0b11110000;
    const UTF_8_LEN_3_MATCH = 0b11100000;

    // 11110xxx: lead byte followed by three continuation bytes.
    const UTF_8_LEN_4_MASK = 0b11111000;
    const UTF_8_LEN_4_MATCH = 0b11110000;

    // 10xxxxxx: continuation byte.
    const UTF_8_CONTINUE_MASK = 0b11000000;
    const UTF_8_CONTINUE_MATCH = 0b10000000;

    module.exports.zipGuessEncoding = zipGuessEncoding;

    /**
     * Guess the encoding of a ZIP string (e.g. an entry name) from its raw
     * bytes. JavaScript port of libzip's `_zip_guess_encoding`.
     *
     * Classification:
     *  - ASCII: every byte is in 32..127 or is CR, LF or TAB.
     *  - UTF-8 (guessed): every other byte is part of a well-formed 2-, 3- or
     *    4-byte UTF-8 sequence.
     *  - CP437: anything else, including a sequence truncated by the end of
     *    the buffer and bytes such as NUL that are neither ASCII nor UTF-8.
     *
     * @param {?(Buffer|Uint8Array|number[])} buf - Raw bytes to inspect. Must
     *   be indexable and yield numeric byte values; a falsy value is treated
     *   as ASCII.
     * @param {number} [expectedEncoding] - One of the ZIP_ENCODING_* codes the
     *   caller expects. Omitted, or ZIP_ENCODING_UNKNOWN (0), means "no
     *   expectation".
     * @returns {number} The detected ZIP_ENCODING_* code. A guessed UTF-8
     *   result is promoted to ZIP_ENCODING_UTF8_KNOWN when that is what was
     *   expected. ZIP_ENCODING_ERROR is returned when an expectation was
     *   given, the result differs from it, and the result is not ASCII.
     */
    function zipGuessEncoding (buf, expectedEncoding) {
      // A missing buffer is reported as ASCII without checking the expectation.
      if (!buf) {
        return ZIP_ENCODING_ASCII;
      }

      let enc = scanBytes(buf);

      const expected = expectedEncoding || ZIP_ENCODING_UNKNOWN;
      if (expected !== ZIP_ENCODING_UNKNOWN) {
        if (expected === ZIP_ENCODING_UTF8_KNOWN && enc === ZIP_ENCODING_UTF8_GUESSED) {
          enc = ZIP_ENCODING_UTF8_KNOWN;
        }
        // Pure ASCII is accepted under any expected encoding.
        if (expected !== enc && enc !== ZIP_ENCODING_ASCII) {
          return ZIP_ENCODING_ERROR;
        }
      }
      return enc;

      // Walks the bytes once and returns ASCII, UTF8_GUESSED or CP437.
      function scanBytes (bytes) {
        let guess = ZIP_ENCODING_ASCII;

        for (let i = 0; i < bytes.length; ++i) {
          const byte = bytes[i];

          // Compare against numeric byte values. With `==`, a string such as
          // '\r' is coerced to the number 0, so it would match NUL and never
          // match CR, LF or TAB.
          if ((byte > 31 && byte < 128) || byte === 0x0d || byte === 0x0a || byte === 0x09) {
            continue;
          }

          // First non-ASCII byte: assume UTF-8 until proven otherwise.
          guess = ZIP_ENCODING_UTF8_GUESSED;

          // Number of continuation bytes the lead byte announces.
          let ulen;
          if ((byte & UTF_8_LEN_2_MASK) === UTF_8_LEN_2_MATCH) {
            ulen = 1;
          } else if ((byte & UTF_8_LEN_3_MASK) === UTF_8_LEN_3_MATCH) {
            ulen = 2;
          } else if ((byte & UTF_8_LEN_4_MASK) === UTF_8_LEN_4_MATCH) {
            ulen = 3;
          } else {
            return ZIP_ENCODING_CP437;
          }

          // The sequence would run past the end of the buffer.
          if (i + ulen >= bytes.length) {
            return ZIP_ENCODING_CP437;
          }

          for (let j = 1; j <= ulen; j++) {
            if ((bytes[i + j] & UTF_8_CONTINUE_MASK) !== UTF_8_CONTINUE_MATCH) {
              return ZIP_ENCODING_CP437;
            }
          }

          // Skip the continuation bytes that were just validated.
          i += ulen;
        }

        return guess;
      }
    }