How to convert file to utf-8 in Python?

I need to convert a bunch of files to utf-8 in Python, and I'm having problems with the "file conversion" part.

I would like to do the equivalent of:

iconv -t utf-8 $file > converted/$file # this is shell code 

Thanks!

+48
python file encoding utf-8
Oct 10 '08 at 13:50
source share
7 answers

You can use the codecs module, for example:

 import codecs

 # Size requested from each read() call; pick whatever block size suits you.
 BLOCKSIZE = 1048576

 # Re-encode the source file as UTF-8 one block at a time, so the whole file
 # never has to be held in memory. sourceFileName and targetFileName are not
 # defined here and must be supplied by the surrounding script.
 with codecs.open(sourceFileName, "r", "your-source-encoding") as sourceFile:
     with codecs.open(targetFileName, "w", "utf-8") as targetFile:
         # iter() with a sentinel keeps calling read() until it returns an
         # empty string, which signals end of file.
         for contents in iter(lambda: sourceFile.read(BLOCKSIZE), ""):
             targetFile.write(contents)

EDIT: Added a BLOCKSIZE parameter to control the size of each chunk read from the file.

+46
Oct 10 '08 at 13:59
source share

This worked for me in a little test:

 sourceEncoding = "iso-8859-1"
 targetEncoding = "utf-8"

 # Convert the file named "source" into the file named "target".
 # Binary mode keeps the raw bytes untouched (no newline translation on any
 # platform), and bytes.decode() works on both Python 2 and Python 3, unlike
 # the Python-2-only unicode() builtin. The with-blocks close both files even
 # if decoding or encoding raises.
 with open("source", "rb") as source:
     with open("target", "wb") as target:
         target.write(source.read().decode(sourceEncoding).encode(targetEncoding))
+27
Oct 10 '08 at 14:07
source share

Thanks for the answers, it works!

And since the source files are in mixed formats, I added a list of source formats to be tried in sequence (sourceFormats); on a UnicodeDecodeError I try the next format:

 from __future__ import with_statement

 import os
 import sys
 import codecs

 from chardet.universaldetector import UniversalDetector

 targetFormat = 'utf-8'
 outputDir = 'converted'
 detector = UniversalDetector()


 def get_encoding_type(current_file):
     """Return the encoding chardet detects for the file at `current_file`.

     The file is fed to the shared module-level `detector` line by line and
     reading stops as soon as the detector reports it is done. The result is
     whatever chardet stored under result['encoding'] (presumably None when
     it cannot decide -- confirm against the chardet documentation).
     """
     detector.reset()
     # chardet works on raw bytes, so read in binary mode; the with-block
     # also closes the handle that the old file() call used to leak.
     with open(current_file, 'rb') as rawFile:
         for line in rawFile:
             detector.feed(line)
             if detector.done:
                 break
     detector.close()
     return detector.result['encoding']


 def convertFileBestGuess(filename):
     """Convert `filename` by trying each encoding in a fixed list in turn.

     The first encoding that decodes the whole file without a
     UnicodeDecodeError wins. If none does, an error line is printed.
     """
     sourceFormats = ['ascii', 'iso-8859-1']
     for sourceFormat in sourceFormats:
         try:
             # Mode 'r' rather than 'rU': the 'U' flag was removed in
             # Python 3.11, and codecs.open reads in binary mode anyway.
             with codecs.open(filename, 'r', sourceFormat) as sourceFile:
                 writeConversion(sourceFile, filename)
                 print('Done.')
                 return
         except UnicodeDecodeError:
             # Wrong guess: fall through and try the next encoding.
             pass
     print("Error: failed to convert '" + filename + "'.")


 def convertFileWithDetection(fileName):
     """Convert `fileName` using the encoding detected by chardet.

     Prints progress, then 'Done.' on success or an error line on failure.
     """
     print("Converting '" + fileName + "'...")
     # With no detection result, fall back to strict ASCII so that empty or
     # plain-ASCII files still convert and anything else is reported below.
     sourceFormat = get_encoding_type(fileName) or 'ascii'
     try:
         with codecs.open(fileName, 'r', sourceFormat) as sourceFile:
             writeConversion(sourceFile, fileName)
             print('Done.')
             return
     except (UnicodeDecodeError, LookupError):
         # LookupError: the detected name is not a codec Python knows.
         pass
     print("Error: failed to convert '" + fileName + "'.")


 def writeConversion(file, fileName=None):
     """Write the decoded lines of `file` to outputDir as `targetFormat`.

     file     -- an open, decoding reader (as returned by codecs.open).
     fileName -- name of the output file inside outputDir. When omitted,
                 the reader's own `name` attribute is used (codecs readers
                 appear to forward it from the underlying file -- confirm).

     outputDir must already exist.
     """
     if fileName is None:
         fileName = file.name
     with codecs.open(outputDir + '/' + fileName, 'w', targetFormat) as targetFile:
         for line in file:
             targetFile.write(line)

 # Off topic: get the file list and call convertFile on each file
 # ...

(EDIT by Rudro Badhon: this includes an original attempt at several formats until you get an exception, as well as an alternative approach using chardet.universaldetector)

+13
Oct 10 '08 at 16:14
source share

To guess the source encoding, you can use the *nix file command.

Example:

 $ file --mime jumper.xml jumper.xml: application/xml; charset=utf-8 
+1
Feb 08 2018-12-12T00:
source share

This is a Python3 function for converting any text file to UTF-8 encoding. (without using unnecessary packages)

 def correctSubtitleEncoding(filename, newFilename, encoding_from, encoding_to='UTF-8'):
     """Re-encode a text file and normalise its line endings to CRLF.

     filename      -- path of the file to read.
     newFilename   -- path of the file to write.
     encoding_from -- encoding used to decode `filename`.
     encoding_to   -- encoding used to write `newFilename` (default UTF-8).

     Every output line, including the last one, is terminated with '\\r\\n'.
     Decoding and encoding errors raised by open()/read()/write() propagate.
     """
     with open(filename, 'r', encoding=encoding_from) as fr:
         # newline='' writes '\r\n' exactly as given; without it, text mode
         # on Windows would expand the '\n' again and produce '\r\r\n'.
         with open(newFilename, 'w', encoding=encoding_to, newline='') as fw:
             for line in fr:
                 # Only strip an actual newline: the final line may not have
                 # one, and blindly cutting a character would lose data.
                 if line.endswith('\n'):
                     line = line[:-1]
                 fw.write(line + '\r\n')

You can easily use it in a loop to convert a list of files.

+1
Jan 08 '17 at 17:58
source share

This is my brute force method. It also takes care of mixed \n and \r\n in the input.

  # open the CSV file inputfile = open(filelocation, 'rb') outputfile = open(outputfilelocation, 'w', encoding='utf-8') for line in inputfile: if line[-2:] == b'\r\n' or line[-2:] == b'\n\r': output = line[:-2].decode('utf-8', 'replace') + '\n' elif line[-1:] == b'\r' or line[-1:] == b'\n': output = line[:-1].decode('utf-8', 'replace') + '\n' else: output = line.decode('utf-8', 'replace') + '\n' outputfile.write(output) outputfile.close() except BaseException as error: cfg.log(self.outf, "Error(18): opening CSV-file " + filelocation + " failed: " + str(error)) self.loadedwitherrors = 1 return ([]) try: # open the CSV-file of this source table csvreader = csv.reader(open(outputfilelocation, "rU"), delimiter=delimitervalue, quoting=quotevalue, dialect=csv.excel_tab) except BaseException as error: cfg.log(self.outf, "Error(19): reading CSV-file " + filelocation + " failed: " + str(error)) 
0
Nov 30 '18 at 7:35
source share

Answer for unknown source encoding type

Based on the answer by @Sebastian Rocca Serra

python3.6

 import os
 from chardet import detect


 # get file encoding type
 def get_encoding_type(file):
     """Return the encoding name chardet detects for the file at path `file`.

     The whole file is read into memory as bytes and passed to detect().
     """
     with open(file, 'rb') as f:
         rawdata = f.read()
     return detect(rawdata)['encoding']


 # srcfile and trgfile are not defined here and must be supplied by the
 # surrounding script.
 from_codec = get_encoding_type(srcfile)

 try:
     # newline='' on both sides passes line endings through unchanged, so
     # only the encoding is converted.
     with open(srcfile, 'r', encoding=from_codec, newline='') as f, \
             open(trgfile, 'w', encoding='utf-8', newline='') as e:
         text = f.read()  # for small files, for big use chunks
         e.write(text)
 except UnicodeDecodeError:
     print('Decode Error')
     # Do not leave a partial target file behind after a failed conversion.
     if os.path.exists(trgfile):
         os.remove(trgfile)
 except UnicodeEncodeError:
     print('Encode Error')
     if os.path.exists(trgfile):
         os.remove(trgfile)
 else:
     # Swap the converted file into place in a single step, so there is no
     # window in which the original is deleted but not yet replaced.
     os.replace(trgfile, srcfile)
0
Dec 19 '18 at 12:59
source share



All Articles