go: Legacy character encoding conversion to UTF-8

Go uses UTF-8 encoding for its strings and source code, but text files are sometimes stored in another encoding, such as Windows-1250. Here is an example of how such text can be converted to UTF-8: https://play.golang.org/p/K3JAeM4nCVD

The decoding table used in this program was derived from the file CP1250.TXT at ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/ A question mark is used in place of undefined characters. The decoding algorithm translates each byte of the byte slice into a rune in a rune slice of the same length, and the rune slice is then converted into a string. The file-related functions convertFileWin1250toUTF8 and checkError are not used here, on the online Go Playground site. It's possible to develop decoding tables for other code pages: CP1251.TXT, CP1252.TXT, CP1253.TXT, CP1254.TXT, CP1255.TXT, CP1256.TXT, CP1257.TXT, CP1258.TXT, CP874.TXT, CP932.TXT, CP936.TXT, CP949.TXT, CP950.TXT. Note that CP932, CP936, CP949 and CP950 are multi-byte code pages, so for them a 256-entry table is not sufficient and the decoding loop would also have to handle two-byte sequences.

package main

import (
	"fmt"
	"io/ioutil"
)

// runeWindows1250 is the Windows-1250 (CP1250) decoding table. It has one
// entry per byte value 0x00-0xFF: the element at index b is the Unicode code
// point that the Windows-1250 byte b decodes to.
//
// The table is based on the character table at
// ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1250.TXT
//
// The five byte values marked UNDEFINED below (0x81, 0x83, 0x88, 0x90, 0x98)
// are mapped to a question mark, '\u003F'.
var runeWindows1250 = [...]rune{
	'\u0000', // 0x00 NULL
	'\u0001', // 0x01 START OF HEADING
	'\u0002', // 0x02 START OF TEXT
	'\u0003', // 0x03 END OF TEXT
	'\u0004', // 0x04 END OF TRANSMISSION
	'\u0005', // 0x05 ENQUIRY
	'\u0006', // 0x06 ACKNOWLEDGE
	'\u0007', // 0x07 BELL
	'\u0008', // 0x08 BACKSPACE
	'\u0009', // 0x09 HORIZONTAL TABULATION
	'\u000A', // 0x0A LINE FEED
	'\u000B', // 0x0B VERTICAL TABULATION
	'\u000C', // 0x0C FORM FEED
	'\u000D', // 0x0D CARRIAGE RETURN
	'\u000E', // 0x0E SHIFT OUT
	'\u000F', // 0x0F SHIFT IN
	'\u0010', // 0x10 DATA LINK ESCAPE
	'\u0011', // 0x11 DEVICE CONTROL ONE
	'\u0012', // 0x12 DEVICE CONTROL TWO
	'\u0013', // 0x13 DEVICE CONTROL THREE
	'\u0014', // 0x14 DEVICE CONTROL FOUR
	'\u0015', // 0x15 NEGATIVE ACKNOWLEDGE
	'\u0016', // 0x16 SYNCHRONOUS IDLE
	'\u0017', // 0x17 END OF TRANSMISSION BLOCK
	'\u0018', // 0x18 CANCEL
	'\u0019', // 0x19 END OF MEDIUM
	'\u001A', // 0x1A SUBSTITUTE
	'\u001B', // 0x1B ESCAPE
	'\u001C', // 0x1C FILE SEPARATOR
	'\u001D', // 0x1D GROUP SEPARATOR
	'\u001E', // 0x1E RECORD SEPARATOR
	'\u001F', // 0x1F UNIT SEPARATOR
	'\u0020', // 0x20 SPACE
	'\u0021', // 0x21 EXCLAMATION MARK
	'\u0022', // 0x22 QUOTATION MARK
	'\u0023', // 0x23 NUMBER SIGN
	'\u0024', // 0x24 DOLLAR SIGN
	'\u0025', // 0x25 PERCENT SIGN
	'\u0026', // 0x26 AMPERSAND
	'\u0027', // 0x27 APOSTROPHE
	'\u0028', // 0x28 LEFT PARENTHESIS
	'\u0029', // 0x29 RIGHT PARENTHESIS
	'\u002A', // 0x2A ASTERISK
	'\u002B', // 0x2B PLUS SIGN
	'\u002C', // 0x2C COMMA
	'\u002D', // 0x2D HYPHEN-MINUS
	'\u002E', // 0x2E FULL STOP
	'\u002F', // 0x2F SOLIDUS
	'\u0030', // 0x30 DIGIT ZERO
	'\u0031', // 0x31 DIGIT ONE
	'\u0032', // 0x32 DIGIT TWO
	'\u0033', // 0x33 DIGIT THREE
	'\u0034', // 0x34 DIGIT FOUR
	'\u0035', // 0x35 DIGIT FIVE
	'\u0036', // 0x36 DIGIT SIX
	'\u0037', // 0x37 DIGIT SEVEN
	'\u0038', // 0x38 DIGIT EIGHT
	'\u0039', // 0x39 DIGIT NINE
	'\u003A', // 0x3A COLON
	'\u003B', // 0x3B SEMICOLON
	'\u003C', // 0x3C LESS-THAN SIGN
	'\u003D', // 0x3D EQUALS SIGN
	'\u003E', // 0x3E GREATER-THAN SIGN
	'\u003F', // 0x3F QUESTION MARK
	'\u0040', // 0x40 COMMERCIAL AT
	'\u0041', // 0x41 LATIN CAPITAL LETTER A
	'\u0042', // 0x42 LATIN CAPITAL LETTER B
	'\u0043', // 0x43 LATIN CAPITAL LETTER C
	'\u0044', // 0x44 LATIN CAPITAL LETTER D
	'\u0045', // 0x45 LATIN CAPITAL LETTER E
	'\u0046', // 0x46 LATIN CAPITAL LETTER F
	'\u0047', // 0x47 LATIN CAPITAL LETTER G
	'\u0048', // 0x48 LATIN CAPITAL LETTER H
	'\u0049', // 0x49 LATIN CAPITAL LETTER I
	'\u004A', // 0x4A LATIN CAPITAL LETTER J
	'\u004B', // 0x4B LATIN CAPITAL LETTER K
	'\u004C', // 0x4C LATIN CAPITAL LETTER L
	'\u004D', // 0x4D LATIN CAPITAL LETTER M
	'\u004E', // 0x4E LATIN CAPITAL LETTER N
	'\u004F', // 0x4F LATIN CAPITAL LETTER O
	'\u0050', // 0x50 LATIN CAPITAL LETTER P
	'\u0051', // 0x51 LATIN CAPITAL LETTER Q
	'\u0052', // 0x52 LATIN CAPITAL LETTER R
	'\u0053', // 0x53 LATIN CAPITAL LETTER S
	'\u0054', // 0x54 LATIN CAPITAL LETTER T
	'\u0055', // 0x55 LATIN CAPITAL LETTER U
	'\u0056', // 0x56 LATIN CAPITAL LETTER V
	'\u0057', // 0x57 LATIN CAPITAL LETTER W
	'\u0058', // 0x58 LATIN CAPITAL LETTER X
	'\u0059', // 0x59 LATIN CAPITAL LETTER Y
	'\u005A', // 0x5A LATIN CAPITAL LETTER Z
	'\u005B', // 0x5B LEFT SQUARE BRACKET
	'\u005C', // 0x5C REVERSE SOLIDUS
	'\u005D', // 0x5D RIGHT SQUARE BRACKET
	'\u005E', // 0x5E CIRCUMFLEX ACCENT
	'\u005F', // 0x5F LOW LINE
	'\u0060', // 0x60 GRAVE ACCENT
	'\u0061', // 0x61 LATIN SMALL LETTER A
	'\u0062', // 0x62 LATIN SMALL LETTER B
	'\u0063', // 0x63 LATIN SMALL LETTER C
	'\u0064', // 0x64 LATIN SMALL LETTER D
	'\u0065', // 0x65 LATIN SMALL LETTER E
	'\u0066', // 0x66 LATIN SMALL LETTER F
	'\u0067', // 0x67 LATIN SMALL LETTER G
	'\u0068', // 0x68 LATIN SMALL LETTER H
	'\u0069', // 0x69 LATIN SMALL LETTER I
	'\u006A', // 0x6A LATIN SMALL LETTER J
	'\u006B', // 0x6B LATIN SMALL LETTER K
	'\u006C', // 0x6C LATIN SMALL LETTER L
	'\u006D', // 0x6D LATIN SMALL LETTER M
	'\u006E', // 0x6E LATIN SMALL LETTER N
	'\u006F', // 0x6F LATIN SMALL LETTER O
	'\u0070', // 0x70 LATIN SMALL LETTER P
	'\u0071', // 0x71 LATIN SMALL LETTER Q
	'\u0072', // 0x72 LATIN SMALL LETTER R
	'\u0073', // 0x73 LATIN SMALL LETTER S
	'\u0074', // 0x74 LATIN SMALL LETTER T
	'\u0075', // 0x75 LATIN SMALL LETTER U
	'\u0076', // 0x76 LATIN SMALL LETTER V
	'\u0077', // 0x77 LATIN SMALL LETTER W
	'\u0078', // 0x78 LATIN SMALL LETTER X
	'\u0079', // 0x79 LATIN SMALL LETTER Y
	'\u007A', // 0x7A LATIN SMALL LETTER Z
	'\u007B', // 0x7B LEFT CURLY BRACKET
	'\u007C', // 0x7C VERTICAL LINE
	'\u007D', // 0x7D RIGHT CURLY BRACKET
	'\u007E', // 0x7E TILDE
	'\u007F', // 0x7F DELETE
	'\u20AC', // 0x80 EURO SIGN
	'\u003F', // 0x81 UNDEFINED ?
	'\u201A', // 0x82 SINGLE LOW-9 QUOTATION MARK
	'\u003F', // 0x83 UNDEFINED ?
	'\u201E', // 0x84 DOUBLE LOW-9 QUOTATION MARK
	'\u2026', // 0x85 HORIZONTAL ELLIPSIS
	'\u2020', // 0x86 DAGGER
	'\u2021', // 0x87 DOUBLE DAGGER
	'\u003F', // 0x88 UNDEFINED ?
	'\u2030', // 0x89 PER MILLE SIGN
	'\u0160', // 0x8A LATIN CAPITAL LETTER S WITH CARON
	'\u2039', // 0x8B SINGLE LEFT-POINTING ANGLE QUOTATION MARK
	'\u015A', // 0x8C LATIN CAPITAL LETTER S WITH ACUTE
	'\u0164', // 0x8D LATIN CAPITAL LETTER T WITH CARON
	'\u017D', // 0x8E LATIN CAPITAL LETTER Z WITH CARON
	'\u0179', // 0x8F LATIN CAPITAL LETTER Z WITH ACUTE
	'\u003F', // 0x90 UNDEFINED ?
	'\u2018', // 0x91 LEFT SINGLE QUOTATION MARK
	'\u2019', // 0x92 RIGHT SINGLE QUOTATION MARK
	'\u201C', // 0x93 LEFT DOUBLE QUOTATION MARK
	'\u201D', // 0x94 RIGHT DOUBLE QUOTATION MARK
	'\u2022', // 0x95 BULLET
	'\u2013', // 0x96 EN DASH
	'\u2014', // 0x97 EM DASH
	'\u003F', // 0x98 UNDEFINED ?
	'\u2122', // 0x99 TRADE MARK SIGN
	'\u0161', // 0x9A LATIN SMALL LETTER S WITH CARON
	'\u203A', // 0x9B SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
	'\u015B', // 0x9C LATIN SMALL LETTER S WITH ACUTE
	'\u0165', // 0x9D LATIN SMALL LETTER T WITH CARON
	'\u017E', // 0x9E LATIN SMALL LETTER Z WITH CARON
	'\u017A', // 0x9F LATIN SMALL LETTER Z WITH ACUTE
	'\u00A0', // 0xA0 NO-BREAK SPACE
	'\u02C7', // 0xA1 CARON
	'\u02D8', // 0xA2 BREVE
	'\u0141', // 0xA3 LATIN CAPITAL LETTER L WITH STROKE
	'\u00A4', // 0xA4 CURRENCY SIGN
	'\u0104', // 0xA5 LATIN CAPITAL LETTER A WITH OGONEK
	'\u00A6', // 0xA6 BROKEN BAR
	'\u00A7', // 0xA7 SECTION SIGN
	'\u00A8', // 0xA8 DIAERESIS
	'\u00A9', // 0xA9 COPYRIGHT SIGN
	'\u015E', // 0xAA LATIN CAPITAL LETTER S WITH CEDILLA
	'\u00AB', // 0xAB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
	'\u00AC', // 0xAC NOT SIGN
	'\u00AD', // 0xAD SOFT HYPHEN
	'\u00AE', // 0xAE REGISTERED SIGN
	'\u017B', // 0xAF LATIN CAPITAL LETTER Z WITH DOT ABOVE
	'\u00B0', // 0xB0 DEGREE SIGN
	'\u00B1', // 0xB1 PLUS-MINUS SIGN
	'\u02DB', // 0xB2 OGONEK
	'\u0142', // 0xB3 LATIN SMALL LETTER L WITH STROKE
	'\u00B4', // 0xB4 ACUTE ACCENT
	'\u00B5', // 0xB5 MICRO SIGN
	'\u00B6', // 0xB6 PILCROW SIGN
	'\u00B7', // 0xB7 MIDDLE DOT
	'\u00B8', // 0xB8 CEDILLA
	'\u0105', // 0xB9 LATIN SMALL LETTER A WITH OGONEK
	'\u015F', // 0xBA LATIN SMALL LETTER S WITH CEDILLA
	'\u00BB', // 0xBB RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
	'\u013D', // 0xBC LATIN CAPITAL LETTER L WITH CARON
	'\u02DD', // 0xBD DOUBLE ACUTE ACCENT
	'\u013E', // 0xBE LATIN SMALL LETTER L WITH CARON
	'\u017C', // 0xBF LATIN SMALL LETTER Z WITH DOT ABOVE
	'\u0154', // 0xC0 LATIN CAPITAL LETTER R WITH ACUTE
	'\u00C1', // 0xC1 LATIN CAPITAL LETTER A WITH ACUTE
	'\u00C2', // 0xC2 LATIN CAPITAL LETTER A WITH CIRCUMFLEX
	'\u0102', // 0xC3 LATIN CAPITAL LETTER A WITH BREVE
	'\u00C4', // 0xC4 LATIN CAPITAL LETTER A WITH DIAERESIS
	'\u0139', // 0xC5 LATIN CAPITAL LETTER L WITH ACUTE
	'\u0106', // 0xC6 LATIN CAPITAL LETTER C WITH ACUTE
	'\u00C7', // 0xC7 LATIN CAPITAL LETTER C WITH CEDILLA
	'\u010C', // 0xC8 LATIN CAPITAL LETTER C WITH CARON
	'\u00C9', // 0xC9 LATIN CAPITAL LETTER E WITH ACUTE
	'\u0118', // 0xCA LATIN CAPITAL LETTER E WITH OGONEK
	'\u00CB', // 0xCB LATIN CAPITAL LETTER E WITH DIAERESIS
	'\u011A', // 0xCC LATIN CAPITAL LETTER E WITH CARON
	'\u00CD', // 0xCD LATIN CAPITAL LETTER I WITH ACUTE
	'\u00CE', // 0xCE LATIN CAPITAL LETTER I WITH CIRCUMFLEX
	'\u010E', // 0xCF LATIN CAPITAL LETTER D WITH CARON
	'\u0110', // 0xD0 LATIN CAPITAL LETTER D WITH STROKE
	'\u0143', // 0xD1 LATIN CAPITAL LETTER N WITH ACUTE
	'\u0147', // 0xD2 LATIN CAPITAL LETTER N WITH CARON
	'\u00D3', // 0xD3 LATIN CAPITAL LETTER O WITH ACUTE
	'\u00D4', // 0xD4 LATIN CAPITAL LETTER O WITH CIRCUMFLEX
	'\u0150', // 0xD5 LATIN CAPITAL LETTER O WITH DOUBLE ACUTE
	'\u00D6', // 0xD6 LATIN CAPITAL LETTER O WITH DIAERESIS
	'\u00D7', // 0xD7 MULTIPLICATION SIGN
	'\u0158', // 0xD8 LATIN CAPITAL LETTER R WITH CARON
	'\u016E', // 0xD9 LATIN CAPITAL LETTER U WITH RING ABOVE
	'\u00DA', // 0xDA LATIN CAPITAL LETTER U WITH ACUTE
	'\u0170', // 0xDB LATIN CAPITAL LETTER U WITH DOUBLE ACUTE
	'\u00DC', // 0xDC LATIN CAPITAL LETTER U WITH DIAERESIS
	'\u00DD', // 0xDD LATIN CAPITAL LETTER Y WITH ACUTE
	'\u0162', // 0xDE LATIN CAPITAL LETTER T WITH CEDILLA
	'\u00DF', // 0xDF LATIN SMALL LETTER SHARP S
	'\u0155', // 0xE0 LATIN SMALL LETTER R WITH ACUTE
	'\u00E1', // 0xE1 LATIN SMALL LETTER A WITH ACUTE
	'\u00E2', // 0xE2 LATIN SMALL LETTER A WITH CIRCUMFLEX
	'\u0103', // 0xE3 LATIN SMALL LETTER A WITH BREVE
	'\u00E4', // 0xE4 LATIN SMALL LETTER A WITH DIAERESIS
	'\u013A', // 0xE5 LATIN SMALL LETTER L WITH ACUTE
	'\u0107', // 0xE6 LATIN SMALL LETTER C WITH ACUTE
	'\u00E7', // 0xE7 LATIN SMALL LETTER C WITH CEDILLA
	'\u010D', // 0xE8 LATIN SMALL LETTER C WITH CARON
	'\u00E9', // 0xE9 LATIN SMALL LETTER E WITH ACUTE
	'\u0119', // 0xEA LATIN SMALL LETTER E WITH OGONEK
	'\u00EB', // 0xEB LATIN SMALL LETTER E WITH DIAERESIS
	'\u011B', // 0xEC LATIN SMALL LETTER E WITH CARON
	'\u00ED', // 0xED LATIN SMALL LETTER I WITH ACUTE
	'\u00EE', // 0xEE LATIN SMALL LETTER I WITH CIRCUMFLEX
	'\u010F', // 0xEF LATIN SMALL LETTER D WITH CARON
	'\u0111', // 0xF0 LATIN SMALL LETTER D WITH STROKE
	'\u0144', // 0xF1 LATIN SMALL LETTER N WITH ACUTE
	'\u0148', // 0xF2 LATIN SMALL LETTER N WITH CARON
	'\u00F3', // 0xF3 LATIN SMALL LETTER O WITH ACUTE
	'\u00F4', // 0xF4 LATIN SMALL LETTER O WITH CIRCUMFLEX
	'\u0151', // 0xF5 LATIN SMALL LETTER O WITH DOUBLE ACUTE
	'\u00F6', // 0xF6 LATIN SMALL LETTER O WITH DIAERESIS
	'\u00F7', // 0xF7 DIVISION SIGN
	'\u0159', // 0xF8 LATIN SMALL LETTER R WITH CARON
	'\u016F', // 0xF9 LATIN SMALL LETTER U WITH RING ABOVE
	'\u00FA', // 0xFA LATIN SMALL LETTER U WITH ACUTE
	'\u0171', // 0xFB LATIN SMALL LETTER U WITH DOUBLE ACUTE
	'\u00FC', // 0xFC LATIN SMALL LETTER U WITH DIAERESIS
	'\u00FD', // 0xFD LATIN SMALL LETTER Y WITH ACUTE
	'\u0163', // 0xFE LATIN SMALL LETTER T WITH CEDILLA
	'\u02D9', // 0xFF DOT ABOVE
}

// win1250BytesToString decodes Windows-1250 encoded bytes into a Go (UTF-8)
// string. Every input byte is looked up in runeWindows1250 and contributes
// exactly one rune to the result; the collected runes are then converted to a
// string in a single step.
func win1250BytesToString(bytes []byte) string {
	// One rune per input byte, so the final length is known up front.
	decoded := make([]rune, 0, len(bytes))
	for _, code := range bytes {
		decoded = append(decoded, runeWindows1250[code])
	}
	return string(decoded)
}

// checkError panics with e when e is non-nil and does nothing otherwise.
func checkError(e error) {
	if e == nil {
		return
	}
	panic(e)
}

// convertFileWin1250toUTF8 reads the whole of fileFrom, decodes its content
// with win1250BytesToString and writes the resulting UTF-8 text to fileTo
// with permission bits 0644. A read or write error is passed to checkError,
// which panics on any non-nil error.
func convertFileWin1250toUTF8(fileFrom string, fileTo string) {
	source, err := ioutil.ReadFile(fileFrom)
	checkError(err)

	converted := []byte(win1250BytesToString(source))
	checkError(ioutil.WriteFile(fileTo, converted, 0644))
}

// main demonstrates the conversion on a small in-memory sample: it prints a
// short banner, the sample bytes in hex, and then the decoded string together
// with the byte lengths of the input slice and of the resulting string.
func main() {
	banner := []string{
		"Converting bytes encoded in Windows-1250 into UTF-8",
		"Sample slice []byte named bytesA has two capital A, space, and various A with accents",
		"", // blank separator line
	}
	for _, line := range banner {
		fmt.Println(line)
	}

	// 'A' written as decimal and as hex, a space, then accented capital A variants.
	bytesA := []byte{65, 0x41, 0x20, 0xA5, 0xC1, 0xC2, 0xC3, 0xC4}
	fmt.Printf("bytesA hex values: % x\n", bytesA)

	decoded := win1250BytesToString(bytesA)
	// len(decoded) counts UTF-8 bytes, not characters.
	fmt.Printf("len(bytesA)=%d, len(str)=%d, str=%s\n", len(bytesA), len(decoded), decoded)
}
This entry was posted in golang, workday. Bookmark the permalink.

Leave a Reply