첨부 실행 코드는 나눔고딕코딩 폰트를 사용합니다.
유용한 소스 코드가 있으면 icodebroker@naver.com으로 보내주시면 감사합니다.
블로그 자료는 자유롭게 사용하세요.

■ 텍스트 파일 인코딩 구하기

------------------------------------------------------------------------------------------------------------------------

TestProject.zip


Program.cs

 

 

using System;

using System.Text;

 

namespace TestProject

{

    /// <summary>

    /// 프로그램

    /// </summary>

    class Program

    {

        //////////////////////////////////////////////////////////////////////////////////////////////////// Method

        ////////////////////////////////////////////////////////////////////////////////////////// Static

        //////////////////////////////////////////////////////////////////////////////// Private

 

        #region 프로그램 시작하기 - Main()

 

        /// <summary>

        /// 프로그램 시작하기

        /// </summary>

        private static void Main()

        {

            string filePath = "c:\\sample.txt";

 

            Encoding encoding = TextFileEncodingHelper.GetTextFileEncoding(filePath);

 

            Console.WriteLine(encoding.EncodingName);

        }

 

        #endregion

    }

}

 

 

TextFileEncodingHelper.cs

 

 

using System;

using System.IO;

using System.Text;

using System.Text.RegularExpressions;

 

namespace TestProject

{

    /// <summary>

    /// 텍스트 파일 인코딩 헬퍼

    /// </summary>

    public static class TextFileEncodingHelper

    {

        //////////////////////////////////////////////////////////////////////////////////////////////////// Field

        ////////////////////////////////////////////////////////////////////////////////////////// Private

 

        #region Field

 

        /// <summary>
        /// Default heuristic sample size in bytes (0x10000 = 65,536); used by the overloads
        /// that do not take an explicit sample size.
        /// </summary>
        private const long DEFAULT_HEURISTIC_SAMPLE_SIZE = 0x10000;

 

        #endregion

 

        //////////////////////////////////////////////////////////////////////////////////////////////////// Method

        ////////////////////////////////////////////////////////////////////////////////////////// Static

        //////////////////////////////////////////////////////////////////////////////// Public

 

        #region 텍스트 파일 인코딩 구하기 - GetTextFileEncoding(stream, heuristicSampleSize, hasBOM)

 

        /// <summary>

        /// 텍스트 파일 인코딩 구하기

        /// </summary>

        /// <param name="stream">스트림</param>

        /// <param name="heuristicSampleSize">휴리스틱 샘플 크기</param>

        /// <param name="hasBOM">BOM 존재 여부</param>

        /// <returns>텍스트 파일 인코딩</returns>

        public static Encoding GetTextFileEncoding(FileStream stream, long heuristicSampleSize, out bool hasBOM)

        {

            if(stream == null)

            {

                throw new ArgumentNullException("stream이 null 입니다!", "stream");

            }

 

            if(!stream.CanRead)

            {

                throw new ArgumentException("파일 스트림을 읽을 수 없습니다!", "stream");

            }

 

            if(!stream.CanSeek)

            {

                throw new ArgumentException("파일 스트림을 탐색할 수 없습니다!", "stream");

            }

 

            Encoding encodingFound = null;

 

            long originalPosition = stream.Position;

 

            stream.Position = 0L;

 

            byte[] bomByteArray = new byte[stream.Length > 4 ? 4 : stream.Length];

 

            stream.Read(bomByteArray, 0, bomByteArray.Length);

 

            encodingFound = GetEncoding(bomByteArray);

 

            if(encodingFound != null)

            {

                stream.Position = originalPosition;

 

                hasBOM = true;

 

                return encodingFound;

            }

 

            byte[] sampleByteArray = new byte[heuristicSampleSize > stream.Length ? stream.Length : heuristicSampleSize];

 

            Array.Copy(bomByteArray, sampleByteArray, bomByteArray.Length);

 

            if(stream.Length > bomByteArray.Length)

            {

                stream.Read(sampleByteArray, bomByteArray.Length, sampleByteArray.Length - bomByteArray.Length);

            }

 

            stream.Position = originalPosition;

 

            encodingFound = GetUnicodeEncodingByHeuristics(sampleByteArray);

 

            hasBOM = false;

 

            return encodingFound;

        }

 

        #endregion

        #region 텍스트 파일 인코딩 구하기 - GetTextFileEncoding(stream, heuristicSampleSize)

 

        /// <summary>

        /// 텍스트 파일 인코딩 구하기

        /// </summary>

        /// <param name="stream">스트림</param>

        /// <param name="heuristicSampleSize">휴리스틱 샘플 크기</param>

        /// <returns>텍스트 파일 인코딩</returns>

        public static Encoding GetTextFileEncoding(FileStream stream, long heuristicSampleSize)

        {

            bool useless = false;

 

            return GetTextFileEncoding(stream, DEFAULT_HEURISTIC_SAMPLE_SIZE, out useless);

        }

 

        #endregion

        #region 텍스트 파일 인코딩 구하기 - GetTextFileEncoding(filePath)

 

        /// <summary>

        /// 텍스트 파일 인코딩 구하기

        /// </summary>

        /// <param name="filePath">파일 경로</param>

        /// <returns>텍스트 파일 인코딩</returns>

        public static Encoding GetTextFileEncoding(string filePath)

        {

            using(FileStream stream = File.OpenRead(filePath))

            {

                return GetTextFileEncoding(stream, DEFAULT_HEURISTIC_SAMPLE_SIZE);

            }

        }

 

        #endregion

 

        #region 인코딩 구하기 - GetEncoding(bomByteArray)

 

        /// <summary>

        /// 인코딩 구하기

        /// </summary>

        /// <param name="bomByteArray">BOM 바이트 배열</param>

        /// <returns>인코딩</returns>

        public static Encoding GetEncoding(byte[] bomByteArray)

        {

            if(bomByteArray == null)

            {

                throw new ArgumentNullException("bomByteArray가 null 입니다.", "bomByteArray");

            }

 

            if(bomByteArray.Length < 2)

            {

                return null;

            }

 

            if(bomByteArray[0] == 0xff && bomByteArray[1] == 0xfe && (bomByteArray.Length < 4 || bomByteArray[2] != 0 ||

                bomByteArray[3] != 0))

            {

                return Encoding.Unicode;

            }

 

            if(bomByteArray[0] == 0xfe && bomByteArray[1] == 0xff)

            {

                return Encoding.BigEndianUnicode;

            }

 

            if(bomByteArray.Length < 3)

            {

                return null;

            }

 

            if(bomByteArray[0] == 0xef && bomByteArray[1] == 0xbb && bomByteArray[2] == 0xbf)

            {

                return Encoding.UTF8;

            }

 

            if(bomByteArray[0] == 0x2b && bomByteArray[1] == 0x2f && bomByteArray[2] == 0x76)

            {

                return Encoding.UTF7;

            }

 

            if(bomByteArray.Length < 4)

            {

                return null;

            }

 

            if(bomByteArray[0] == 0xff && bomByteArray[1] == 0xfe && bomByteArray[2] == 0 && bomByteArray[3] == 0)

            {

                return Encoding.UTF32;

            }

 

            if(bomByteArray[0] == 0 && bomByteArray[1] == 0 && bomByteArray[2] == 0xfe && bomByteArray[3] == 0xff)

            {

                return Encoding.GetEncoding(12001);

            }

 

            return null;

        }

 

        #endregion

        #region 휴리스틱으로 유니코드 인코딩 구하기 - GetUnicodeEncodingByHeuristics(sampleByteArray)

 

        /// <summary>

        /// 휴리스틱으로 유니코드 인코딩 구하기

        /// </summary>

        /// <param name="sampleByteArray">샘플 바이트 배열</param>

        /// <returns>유니코드 인코딩</returns>

        public static Encoding GetUnicodeEncodingByHeuristics(byte[] sampleByteArray)

        {

            long oddBinaryNullCountInSample     = 0;

            long evenBinaryNullCountInSample    = 0;

            long suspiciousUTF8SequenceCount    = 0;

            long suspiciousUTF8ByteTotalCount   = 0;

            long likelyUSASCIIByteCountInSample = 0;

 

            long currentPosition   = 0;

            int  skipUTF8ByteCount = 0;

 

            while(currentPosition < sampleByteArray.Length)

            {

                if(sampleByteArray[currentPosition] == 0)

                {

                    if(currentPosition % 2 == 0)

                    {

                        evenBinaryNullCountInSample++;

                    }

                    else

                    {

                        oddBinaryNullCountInSample++;

                    }

                }

 

                if(IsCommonUSASCIIByte(sampleByteArray[currentPosition]))

                {

                    likelyUSASCIIByteCountInSample++;

                }

                    

                if(skipUTF8ByteCount == 0)

                {

                    int lengthFound = GetSuspiciousUTF8SequenceLength(sampleByteArray, currentPosition);

 

                    if (lengthFound > 0)

                    {

                        suspiciousUTF8SequenceCount++;

                        suspiciousUTF8ByteTotalCount += lengthFound;

                        skipUTF8ByteCount = lengthFound - 1;

                    }

                }

                else

                {

                    skipUTF8ByteCount--;

                }

 

                currentPosition++;

            }

 

            if(((evenBinaryNullCountInSample * 2.0) / sampleByteArray.Length) < 0.2 && ((oddBinaryNullCountInSample * 2.0) /

                sampleByteArray.Length) > 0.6)

            {

                return Encoding.Unicode;

            }

 

            if(((oddBinaryNullCountInSample * 2.0) / sampleByteArray.Length) < 0.2 && ((evenBinaryNullCountInSample * 2.0) /

                sampleByteArray.Length) > 0.6)

            {

                return Encoding.BigEndianUnicode;

            }

 

            string potentiallyMangledString = Encoding.ASCII.GetString(sampleByteArray);

 

            Regex utf8ValidatorRegex = new Regex

            (

                @"\A("                                +

                @"[\x09\x0A\x0D\x20-\x7E]"            +

                @"|[\xC2-\xDF][\x80-\xBF]"            +

                @"|\xE0[\xA0-\xBF][\x80-\xBF]"        +

                @"|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}" +

                @"|\xED[\x80-\x9F][\x80-\xBF]"        +

                @"|\xF0[\x90-\xBF][\x80-\xBF]{2}"     +

                @"|[\xF1-\xF3][\x80-\xBF]{3}"         +

                @"|\xF4[\x80-\x8F][\x80-\xBF]{2}"     +

                @")*\z"

            );

 

            if(utf8ValidatorRegex.IsMatch(potentiallyMangledString))

            {

                if((suspiciousUTF8SequenceCount * 500000.0 / sampleByteArray.Length >= 1) &&

                    (sampleByteArray.Length - suspiciousUTF8ByteTotalCount == 0 || likelyUSASCIIByteCountInSample * 1.0 /

                        (sampleByteArray.Length - suspiciousUTF8ByteTotalCount) >= 0.8))

                {

                    return Encoding.UTF8;

                }

            }

 

            return null;

        }

 

        #endregion

        #region 텍스트 바이트 배열 인코딩 구하기 - GetTextByteArrayEncoding(textByteArray, hasBOM)

 

        /// <summary>

        /// 텍스트 바이트 배열 인코딩 구하기

        /// </summary>

        /// <param name="textByteArray">텍스트 바이트 배열</param>

        /// <param name="hasBOM">BOM 존재 여부</param>

        /// <returns>텍스트 바이트 배열 인코딩</returns>

        public static Encoding GetTextByteArrayEncoding(byte[] textByteArray, out bool hasBOM)

        {

            if(textByteArray == null)

            {

                throw new ArgumentNullException("textByteArray가 null 입니다.", "textByteArray");

            }

 

            Encoding encodingFound = null;

 

            encodingFound = GetEncoding(textByteArray);

 

            if(encodingFound != null)

            {

                hasBOM = true;

 

                return encodingFound;

            }

            else

            {

                encodingFound = GetUnicodeEncodingByHeuristics(textByteArray);

 

                hasBOM = false;

 

                return encodingFound;

            }

        }

 

        #endregion

        #region 텍스트 바이트 배열 인코딩 구하기 - GetTextByteArrayEncoding(textByteArray)

 

        /// <summary>

        /// 텍스트 바이트 배열 인코딩 구하기

        /// </summary>

        /// <param name="textByteArray">텍스트 바이트 배열</param>

        /// <returns>텍스트 바이트 배열 인코딩</returns>

        public static Encoding GetTextByteArrayEncoding(byte[] textByteArray)

        {

            bool useless = false;

 

            return GetTextByteArrayEncoding(textByteArray, out useless);

        }

 

        #endregion

 

        #region 바이트 배열에서 문자열 구하기 - GetStringFromByteArray(textByteArray, defaultEncoding, maximumHeuristicSampleSize)

 

        /// <summary>

        /// 바이트 배열에서 문자열 구하기

        /// </summary>

        /// <param name="textByteArray">텍스트 바이트 배열</param>

        /// <param name="defaultEncoding">디폴트 인코딩</param>

        /// <param name="maximumHeuristicSampleSize">최대 휴리스틱 샘플 크기</param>

        /// <returns>문자열</returns>

        public static string GetStringFromByteArray(byte[] textByteArray, Encoding defaultEncoding, long maximumHeuristicSampleSize)

        {

            if(textByteArray == null)

            {

                throw new ArgumentNullException("textByteArray가 null 입니다.", "textByteArray");

            }

 

            Encoding encodingFound = null;

 

            encodingFound = GetEncoding(textByteArray);

 

            if(encodingFound != null)

            {

                return encodingFound.GetString

                (

                    textByteArray,

                    encodingFound.GetPreamble().Length,

                    textByteArray.Length - encodingFound.GetPreamble().Length

                );

            }

            else

            {

                byte[] heuristicSampleByteArray = null;

 

                if(textByteArray.Length > maximumHeuristicSampleSize)

                {

                    heuristicSampleByteArray = new byte[maximumHeuristicSampleSize];

 

                    Array.Copy(textByteArray, heuristicSampleByteArray, maximumHeuristicSampleSize);

                }

                else

                {

                    heuristicSampleByteArray = textByteArray;

                }

 

                encodingFound = GetUnicodeEncodingByHeuristics(textByteArray) ?? defaultEncoding;

 

                return encodingFound.GetString(textByteArray);

            }

        }

 

        #endregion

        #region 바이트 배열에서 문자열 구하기 - GetStringFromByteArray(textByteArray, defaultEncoding)

 

        /// <summary>

        /// 바이트 배열에서 문자열 구하기

        /// </summary>

        /// <param name="textByteArray">텍스트 바이트 배열</param>

        /// <param name="defaultEncoding">디폴트 인코딩</param>

        /// <returns>문자열</returns>

        public static string GetStringFromByteArray(byte[] textByteArray, Encoding defaultEncoding)

        {

            return GetStringFromByteArray(textByteArray, defaultEncoding, DEFAULT_HEURISTIC_SAMPLE_SIZE);

        }

 

        #endregion

 

        //////////////////////////////////////////////////////////////////////////////// Private

 

        #region 일반 US-ASCII 바이트 여부 구하기 - IsCommonUSASCIIByte(testByte)

 

        /// <summary>

        /// 일반 US-ASCII 바이트 여부 구하기

        /// </summary>

        /// <param name="testByte">테스트 바이트</param>

        /// <returns>일반 US-ASCII 바이트 여부</returns>

        private static bool IsCommonUSASCIIByte(byte testByte)

        {

            if(testByte == 0x0A || // LF

               testByte == 0x0D || // CR

               testByte == 0x09 || // TAB

               (testByte >= 0x20 && testByte <= 0x2F) || // Common Punctuation

               (testByte >= 0x30 && testByte <= 0x39) || // 숫자

               (testByte >= 0x3A && testByte <= 0x40) || // Common Punctuation

               (testByte >= 0x41 && testByte <= 0x5A) || // 대문자

               (testByte >= 0x5B && testByte <= 0x60) || // Common Punctuation

               (testByte >= 0x61 && testByte <= 0x7A) || // 소문자

               (testByte >= 0x7B && testByte <= 0x7E))   // Common Punctuation

            {

                return true;

            }

            else

            {

                return false;

            }

        }

 

        #endregion

        #region 추정 UTF8 시퀀스 길이 구하기 - GetSuspiciousUTF8SequenceLength(sampleByteArray, currentPosition)

 

        /// <summary>

        /// 추정 UTF8 시퀀스 길이 구하기

        /// </summary>

        /// <param name="sampleByteArray">샘플 바이트 배열</param>

        /// <param name="currentPosition">현재 위치</param>

        /// <returns>추정 UTF8 시퀀스 길이</returns>

        private static int GetSuspiciousUTF8SequenceLength(byte[] sampleByteArray, long currentPosition)

        {

            int lengthFound = 0;

 

            if(sampleByteArray.Length > currentPosition + 1 && sampleByteArray[currentPosition] == 0xC2)

            {

                if(sampleByteArray[currentPosition + 1] == 0x81 || sampleByteArray[currentPosition + 1] == 0x8D ||

                    sampleByteArray[currentPosition + 1] == 0x8F)

                {

                    lengthFound = 2;

                }

                else if(sampleByteArray[currentPosition + 1] == 0x90 || sampleByteArray[currentPosition + 1] == 0x9D)

                {

                    lengthFound = 2;

                }

                else if(sampleByteArray[currentPosition + 1] >= 0xA0 && sampleByteArray[currentPosition + 1] <= 0xBF)

                {

                    lengthFound = 2;

                }

            }

            else if(sampleByteArray.Length > currentPosition + 1 && sampleByteArray[currentPosition] == 0xC3)

            {

                if(sampleByteArray[currentPosition + 1] >= 0x80 && sampleByteArray[currentPosition + 1] <= 0xBF)

                {

                    lengthFound = 2;

                }

            }

            else if(sampleByteArray.Length > currentPosition + 1 && sampleByteArray[currentPosition] == 0xC5)

            {

                if(sampleByteArray[currentPosition + 1] == 0x92 || sampleByteArray[currentPosition + 1] == 0x93)

                {

                    lengthFound = 2;

                }

                else if(sampleByteArray[currentPosition + 1] == 0xA0 || sampleByteArray[currentPosition + 1] == 0xA1)

                {

                    lengthFound = 2;

                }

                else if(sampleByteArray[currentPosition + 1] == 0xB8 || sampleByteArray[currentPosition + 1] == 0xBD ||

                    sampleByteArray[currentPosition + 1] == 0xBE)

                {

                    lengthFound = 2;

                }

            }

            else if(sampleByteArray.Length > currentPosition + 1 && sampleByteArray[currentPosition] == 0xC6)

            {

                if(sampleByteArray[currentPosition + 1] == 0x92)

                {

                    lengthFound = 2;

                }

            }

            else if(sampleByteArray.Length > currentPosition + 1 && sampleByteArray[currentPosition] == 0xCB)

            {

                if(sampleByteArray[currentPosition + 1] == 0x86 || sampleByteArray[currentPosition + 1] == 0x9C)

                {

                    lengthFound = 2;

                }

            }

            else if(sampleByteArray.Length > currentPosition + 2 && sampleByteArray[currentPosition] == 0xE2)

            {

                if(sampleByteArray[currentPosition + 1] == 0x80)

                {

                    if(sampleByteArray[currentPosition + 2] == 0x93 || sampleByteArray[currentPosition + 2] == 0x94)

                    {

                        lengthFound = 3;

                    }

 

                    if(sampleByteArray[currentPosition + 2] == 0x98 || sampleByteArray[currentPosition + 2] == 0x99 ||

                        sampleByteArray[currentPosition + 2] == 0x9A)

                    {

                        lengthFound = 3;

                    }

 

                    if(sampleByteArray[currentPosition + 2] == 0x9C || sampleByteArray[currentPosition + 2] == 0x9D ||

                        sampleByteArray[currentPosition + 2] == 0x9E)

                    {

                        lengthFound = 3;

                    }

 

                    if(sampleByteArray[currentPosition + 2] == 0xA0 || sampleByteArray[currentPosition + 2] == 0xA1 ||

                        sampleByteArray[currentPosition + 2] == 0xA2)

                    {

                        lengthFound = 3;

                    }

 

                    if(sampleByteArray[currentPosition + 2] == 0xA6)

                    {

                        lengthFound = 3;

                    }

 

                    if(sampleByteArray[currentPosition + 2] == 0xB0)

                    {

                        lengthFound = 3;

                    }

 

                    if(sampleByteArray[currentPosition + 2] == 0xB9 || sampleByteArray[currentPosition + 2] == 0xBA)

                    {

                        lengthFound = 3;

                    }

                }

                else if(sampleByteArray[currentPosition + 1] == 0x82 && sampleByteArray[currentPosition + 2] == 0xAC)

                {

                    lengthFound = 3;

                }

                else if(sampleByteArray[currentPosition + 1] == 0x84 && sampleByteArray[currentPosition + 2] == 0xA2)

                {

                    lengthFound = 3;

                }

            }

 

            return lengthFound;

        }

 

        #endregion

    }

}

 

------------------------------------------------------------------------------------------------------------------------

Posted by 사용자 icodebroker
TAG , ,

댓글을 달아 주세요