// CharsetConv.h

/**
 *  @file   CharsetConv.h
 *  @brief  iconv または mlang を用いた文字コード変換.
 *  @author tenk* (Masashi Kitamura)
 *  @note
 *  -   win環境では、予め　CoInitialize() されている必要がある.
 *  -   実質 char系エンコーディングのみの対応で、wchar_t(UTF-16)系は未対応.
 *  -   class仕様を mlang,iconvで合わせて、だいたいの使い方を似せているだけで、
 *      細かい動作は違うし、エンコードの指定は、各々の事情の名前を使う必要が
 *      あるため、完全な置き換え等は望まないこと.
 *  -   Public Domain Software
 */

#ifndef CHARSETCONV_H_INCLUDED
#define CHARSETCONV_H_INCLUDED


/* 基本的に、このようなクラス.
    class CharsetConv {
    public:
        CharsetConv();
        ~CharsetConv();
        CharsetConv(const TCHAR* dstEnc, const TCHAR* srcEnc);  // open有コンストラクタ.
        int open(const TCHAR* dstEnc, const TCHAR* srcEnc);     // 開始.
        void close();                                           // 終わり.
        size_t conv(char pDst[], size_t dstBytes, const char* pSrc, size_t srcBytes);               // 指定サイズの変換.
        size_t strConv(char pDst[], size_t dstSize, const char* pSrc, size_t srcSize=size_t(-1));   // \0文字列の変換.
        void reset();                                           // 状態のリセット( iconv 時のみ )
     };
*/

#if defined _MSC_VER // =======================================================
// mlang がんばれば他のコンパイラも可能のようだけど、実質 vc のみ...

#include <windows.h>
#include <stddef.h>
#include <assert.h>
#include <tchar.h>
#include <comdef.h>
#include <mlang.h>


/// mlang を用いた文字コード変換.
class CharsetConv {
public:
    CharsetConv() : mlang_(0), buf2_(0) { init(); }

    ~CharsetConv() {
        close();
        if (mlang_)
            mlang_->Release();
    }

    CharsetConv(const TCHAR* dstEnc, const TCHAR* srcEnc) {
        init();
        open(dstEnc, srcEnc);
    }

    int open(const TCHAR* dstEnc, const TCHAR* srcEnc) {
        assert(!!mlang_);
        if (mlang_) {
            MIMECSETINFO    mi;
            assert(dstEnc && srcEnc);
            srcEnc_ = _tcstol(srcEnc, 0, 0);
            if (srcEnc_ == 0) {
                mlang_->GetCharsetInfo(_bstr_t(srcEnc), &mi );
                srcEnc_ = mi.uiInternetEncoding;
            }
            dstEnc_ = _tcstol(dstEnc, 0, 0);
            if (dstEnc_ == 0) {
                mlang_->GetCharsetInfo(_bstr_t(dstEnc), &mi );
                dstEnc_ = mi.uiInternetEncoding;
            }
            {
                utf8EucJpFlag_ = 0;
                if (srcEnc_ == 51932 && dstEnc_ == 65001) { // euc-jp => utf-8
                    utf8EucJpFlag_ |= 1;
                } else if (srcEnc_ == 65001 && dstEnc_ == 51932) {  // utf-8 => euc-jp
                    utf8EucJpFlag_ |= 2;
                }
                if (utf8EucJpFlag_) {
                    buf2_         = (char*)::operator new( 0x1000 );
                    buf2size_     = 0x1000;
                }
            }
            return 0;
        }
        return -1;
    }

    void close() {
        if (buf2_)
            ::operator delete(buf2_);
    }

    /** 指定サイズの文字列の変換. \0を終端扱いにしないので注意.
     *  ※ 現状 UTF-16系はできそうで出来ない模様orz
     *  @param pDst      変換後の文字列を収めるバッファ
     *  @param dstBytes  pDstの領域バイト数.
     *  @param pSrc      変換する文字列
     *  @param srcBytes  pSrcのバイト数.
     *  @return  -1:エラー  以外=変換したバイト数.
     */
    size_t conv(char pDst[], size_t dstBytes, const char* pSrc, size_t srcBytes) {
        assert(!!mlang_);
        assert(pDst && pSrc && dstBytes > 0 && srcBytes > 0 && srcBytes < size_t(-1));
        DWORD    dwMode = 0;
        UINT     srcSz  = UINT(srcBytes);
        UINT     dstSz  = UINT(dstBytes);
        unsigned srcEnc = srcEnc_;
        if (utf8EucJpFlag_) {   // EUC-Jp <=> UTF-8 直接変換ができないようなので、SJIS経由にする.
            if (convUtf8EucJp(pSrc, srcSz, srcBytes))
                return size_t(-1);
            srcEnc = 932;
        }
        HRESULT rc     = mlang_->ConvertString(&dwMode, srcEnc, dstEnc_, (LPBYTE)pSrc , &srcSz, (LPBYTE)pDst , &dstSz );
        return (rc != S_FALSE) ? dstSz : size_t(-1);
    }

    /** char系文字列のみ対応で、\0終端文字列の変換.
     *  @return -1:エラー  以外:変換したバイト数(\0含まず)
     */
    size_t strConv(char pDst[], size_t dstSize, const char* pSrc, size_t srcSize=size_t(-1)) {
        size_t  srcLen = strlen(pSrc);
        UINT    srcSz  = srcSize < srcLen ? UINT(srcSize) : srcLen;
        size_t  dstSz  = conv(pDst, dstSize, pSrc, srcSz);
        if (dstSz != size_t(-1)) {
            if (dstSz >= dstSize)
                dstSz = dstSize - 1;
            pDst[dstSz] = 0;
        }
        return dstSz;
    }

    void reset() { /* mlang_->ConvertStringReset(); */ }

private:
    CharsetConv(const CharsetConv&);
    void operator=(const CharsetConv&);

    void init() {
        buf2_  = 0;
        mlang_ = 0;
        //if (FAILED(::CoCreateInstance(CLSID_CMultiLanguage, NULL, CLSCTX_ALL, IID_IMultiLanguage3, (LPVOID*)&mlang_)))
        {
            if (FAILED(::CoCreateInstance(CLSID_CMultiLanguage, NULL, CLSCTX_ALL, IID_IMultiLanguage2, (LPVOID*)&mlang_)))
                mlang_ = 0;
        }
    }

    /// EUC-Jp <=> UTF-8 直接変換ができないようなので、SJIS経由にする.
    int convUtf8EucJp(const char*& pSrc, UINT& srcSz, size_t srcBytes) {
        if (srcBytes > buf2size_) {
            ::operator delete(buf2_);
            buf2size_ = srcBytes;
            buf2_     = (char*)::operator new(srcBytes);
            if (buf2_ == 0)
                return size_t(-1);
        }
        UINT    buf2size = buf2size_;
        DWORD   dwMode = 0;
        HRESULT rc = mlang_->ConvertString(&dwMode, srcEnc_, 932, (LPBYTE)pSrc , &srcSz, (LPBYTE)buf2_, &buf2size );
        if (rc != S_OK)
            return size_t(-1);
        pSrc    = buf2_;
        srcSz   = buf2size;
        return 0;
    }

private:
    IMultiLanguage3*    mlang_;
    unsigned            srcEnc_;
    unsigned            dstEnc_;
    unsigned            utf8EucJpFlag_;
    unsigned            buf2size_;
    char*               buf2_;
};



/// Gets the encoding name of the current environment (a rough approximation:
/// the result is simply a code page number rendered as decimal text).
class CharsetConv_Helper {
public:
    /// Returns the console output code page as a decimal string, falling back
    /// to the ANSI code page when GetConsoleOutputCP() reports 0.
    /// The result lives in a static buffer that the next call overwrites.
    static const TCHAR* getCurrentCharset() {
        static TCHAR    buf[64];
        UINT codePage = ::GetConsoleOutputCP();
        if (codePage == 0)              // "0" is not a usable encoding name
            codePage = ::GetACP();
        _ultot(codePage, buf, 10);
        return buf;
    }
};


#else // linux / unix       // ===============================================

#include <iconv.h>
#include <assert.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>


/// Character-set converter built on iconv.
class CharsetConv {
public:
    /// Creates a closed converter; call open() before converting.
    CharsetConv() : icd_(invalidCd()) {}

    /// Closes the conversion descriptor if one is open.
    ~CharsetConv() { close(); }

    /// Creates a converter and immediately calls open().
    /// The result of open() is not reported by this constructor.
    CharsetConv(const char* dstEnc, const char* srcEnc) : icd_(invalidCd()) { open(dstEnc, srcEnc); }

    /** Opens a conversion from srcEnc to dstEnc, closing any previous one.
     *  With GCC-compatible compilers the names are first tried with a
     *  "//TRANSLIT" suffix; if that is rejected the plain names are tried.
     *  @param dstEnc  iconv name of the encoding to convert to.
     *  @param srcEnc  iconv name of the encoding to convert from.
     *  @return 0:success  -1:bad argument or conversion not available.
     */
    int open(const char* dstEnc, const char* srcEnc) {
        assert(dstEnc && srcEnc);
        close();
        if (!dstEnc || !srcEnc)
            return -1;
      #ifdef __GNUC__   // GNU libiconv / glibc is probably in use.
        char src[128], dst[128];
        snprintf(src, sizeof src, "%s//TRANSLIT", srcEnc);
        snprintf(dst, sizeof dst, "%s//TRANSLIT", dstEnc);
        icd_ = iconv_open(dst, src);
      #endif
        // Either no suffix was tried, or this iconv does not accept it.
        if (!isOpen())
            icd_ = iconv_open(dstEnc, srcEnc);
        return isOpen() ? 0 : -1;
    }

    /// Closes the conversion descriptor. Safe to call repeatedly or when never opened.
    void close() {
        if (isOpen()) {
            iconv_close(icd_);
            icd_ = invalidCd();
        }
    }

    /** Converts a byte range of the given size. '\0' is NOT treated as a terminator.
     *  To match the Windows version, no detailed error information is provided.
     *  @param pDst     buffer receiving the converted bytes.
     *  @param dstSize  size of pDst in bytes.
     *  @param pSrc     bytes to convert.
     *  @param srcSize  number of bytes at pSrc.
     *  @return  -1:error  otherwise:number of bytes written to pDst.
     */
    size_t conv(char pDst[], size_t dstSize, const char* pSrc, size_t srcSize) {
        assert(isOpen());
        assert(pDst && pSrc && dstSize > 0 && srcSize > 0);
        if (!isOpen() || !pDst || !pSrc)
            return size_t(-1);
        char*   s = const_cast<char*>(pSrc);    // iconv takes char**, but does not modify the source
        char*   d = pDst;
        if (iconv(icd_, &s, &srcSize, &d, &dstSize) == size_t(-1))
            return size_t(-1);
        return size_t(d - pDst);
    }

    /** Converts a '\0'-terminated char string and '\0'-terminates the result.
     *  One byte of pDst is reserved for the terminator, so output that does
     *  not fit is reported as an error instead of losing its last byte.
     *  @param pDst     buffer receiving the converted string.
     *  @param dstSize  size of pDst in bytes (including room for '\0').
     *  @param pSrc     '\0'-terminated source string.
     *  @param srcSize  upper limit on the number of source bytes to convert.
     *  @return -1:error (pDst holds the terminated partial output)
     *          otherwise:bytes written, excluding '\0'.
     */
    size_t strConv(char pDst[], size_t dstSize, const char* pSrc, size_t srcSize=size_t(-1)) {
        assert(isOpen());
        assert(pDst && pSrc && dstSize > 0 && srcSize > 0);
        if (!pDst || dstSize == 0)
            return size_t(-1);
        pDst[0] = 0;
        if (!isOpen() || !pSrc)
            return size_t(-1);

        const size_t srcLen  = strlen(pSrc);
        size_t       srcLeft = srcSize < srcLen ? srcSize : srcLen;
        size_t       dstLeft = dstSize - 1;     // keep room for the terminator
        char*        s       = const_cast<char*>(pSrc);
        char*        d       = pDst;
        const size_t rc      = iconv(icd_, &s, &srcLeft, &d, &dstLeft);
        const size_t written = size_t(d - pDst);
        pDst[written] = 0;
        return (rc == size_t(-1)) ? size_t(-1) : written;
    }

    /// Resets the conversion state to the initial state (no-op when not open).
    void reset() {
        if (isOpen())
            iconv(icd_, NULL, NULL, NULL, NULL);
    }

private:
    // Not copyable (declared, never defined).
    CharsetConv(const CharsetConv&);
    void operator=(const CharsetConv&);

    /// The value iconv_open() returns on failure; also used to mean "not open".
    static iconv_t invalidCd() { return (iconv_t)(-1); }

    /// True while icd_ holds a descriptor obtained from iconv_open().
    bool isOpen() const { return icd_ != invalidCd(); }

private:
    iconv_t  icd_;      ///< conversion descriptor, or invalidCd() when closed.
};



/// Gets the encoding name of the current locale (a rough approximation that
/// only inspects environment variables).
class CharsetConv_Helper {
public:
    /** Returns the codeset part of the locale named by the environment.
     *  The first non-empty variable among LC_ALL, LC_CTYPE and LANG is used;
     *  from "lang_TERRITORY.codeset@modifier" only "codeset" is returned.
     *  @return the codeset name, or "" when none is specified. The pointer
     *          refers either to the environment string or to an internal
     *          static buffer (overwritten by the next call); copy it if it
     *          must be kept.
     */
    static const char* getCurrentCharset() {
        static const char* const vars[] = { "LC_ALL", "LC_CTYPE", "LANG" };
        static char     codeset[64];

        const char* locale = 0;
        for (size_t i = 0; i < sizeof vars / sizeof vars[0]; ++i) {
            const char* v = getenv(vars[i]);
            if (v && *v) {      // an empty value counts as unset
                locale = v;
                break;
            }
        }
        if (!locale)
            return "";

        const char* dot = strchr(locale, '.');
        if (!dot)
            return "";
        const char*  name = dot + 1;
        const size_t len  = strcspn(name, "@");
        if (name[len] == 0)     // no modifier: the tail of the variable is the codeset
            return name;

        // Strip "@modifier" (e.g. "ISO-8859-15@euro"), which is not part of the encoding name.
        const size_t n = len < sizeof codeset - 1 ? len : sizeof codeset - 1;
        memcpy(codeset, name, n);
        codeset[n] = 0;
        return codeset;
    }
};


#endif  // ====================================================================


#endif // CHARSETCONV_H_INCLUDED