Exporter Excel en CSV

**jouclar** · 13/08/2008, 21h58

Bonjour à tous,

J'ai récupéré un bout de script qui convertit un fichier Excel en autant de fichier CSV qu'il y d'onglet dans le fichier Excel. Mais parce qu'il y a souvent un mais, les caractères comme les accents é, è, à sont convertis en caractères spéciaux @... à priori cela provient de la fonction utf8ize mais je suis trop faible en Python pour m'en sortir tout seul, alors merci d'avance de votre aide.

Voici le code :

Code :

Sélectionner tout - Visualiser dans une fenêtre à part

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/usr/bin/env python
# -*- coding: ISO-8859-1 -*-
 
import  xlrd
import  re
import  os, sys, os.path
import  csv
 
#
# Some portions based on a recipe by Bryan Niederberger from
# the ASPN Python cookbook, under the Python license:
#   http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/483742
# This code is released under the Python license as well.
# -- Philip (flip) Kromer flip@infochimp.org
#
 
#
# Read excel sheet into list of 2-d arrays
#
def xlrd_xls2array(infilename):
    """ Returns a list of sheets; each sheet is a dict containing
    * sheet_name: unicode string naming that sheet
    * sheet_data: 2-D table holding the converted cells of that sheet
    """
    book       = xlrd.open_workbook(infilename)
    sheets     = []
    formatter  = lambda(t,v): format_excelval(book,t,v,False)
 
    for sheet_name in book.sheet_names():
        raw_sheet = book.sheet_by_name(sheet_name)
        data      = []
        for row in range(raw_sheet.nrows):
            (types, values) = (raw_sheet.row_types(row), raw_sheet.row_values(row))
            data.append(map(formatter, zip(types, values)))
        sheets.append({ 'sheet_name': sheet_name, 'sheet_data': data })
    return sheets
 
 
def format_excelval(book, type, value, wanttupledate):
    """ Clean up the incoming excel data """
    ##  Data Type Codes:
    ##  EMPTY   0
    ##  TEXT    1 a Unicode string
    ##  NUMBER  2 float
    ##  DATE    3 float
    ##  BOOLEAN 4 int; 1 means TRUE, 0 means FALSE
    ##  ERROR   5
    returnrow = []
    if   type == 2: # TEXT
        if value == int(value): value = int(value)
    elif type == 3: # NUMBER
        datetuple = xlrd.xldate_as_tuple(value, book.datemode)
        value = datetuple if wanttupledate else tupledate_to_isodate(datetuple)
    elif type == 5: # ERROR
        value = xlrd.error_text_from_code[value]
    return value
 
#
# Save to CSV
#
 
def camelize(s):
    """Makes a reasonable attempt at turning an arbitrary string
    into an identifier-safe CamelCasedString"""
    h = unicode(s)
    h = re.sub(r'(?:[_\s]+)([a-z])',
               lambda m: m.group(1).upper(), h)
    h = re.sub(r'[\-\.]+', '_', h)
    h = re.sub(r'\W',      '',  h)
    return h
 
def utf8ize(l):
    """Make string-like things into utf-8, leave other things alone
    """
    return [unicode(s).encode("utf-8") if hasattr(s,'encode') else s for s in l]
 
 
def dump_csv(table, outdir, outfilename):
    stream = file(os.path.join(outdir, outfilename), 'wb')
    csvout = csv.writer(stream, delimiter=',', doublequote=False, escapechar='\\')
    csvout.writerows( map(utf8ize, table) )
    stream.close()
 
def save_csv_tables(tables, outdir, outfilebase):
    for (sheet_idx, sheet) in enumerate(tables):
        outfilename = "%s_%d_%s.csv" % (outfilebase, sheet_idx, camelize(sheet['sheet_name']))
        dump_csv(sheet['sheet_data'], outdir, outfilename)
 
#
# Process files listed on command line, or all .xls files in current dir if no
# args given
#
re_excelfilename = re.compile(r'(\.xls)$')
try:    args = sys.argv[1:]
except: args = []
if len(args) < 1:
    infilenames = filter(re_excelfilename.search, os.listdir("."))
    infilenames.sort()
else:
    infilenames = args
 
for infilename in infilenames:
    tables = xlrd_xls2array(infilename)
    (outdir, infilebase) = os.path.split(infilename)
    outfilebase = re_excelfilename.sub('', infilebase)
    save_csv_tables(tables, outdir, outfilebase)
##    dump_yaml      (tables, outdir, outfilebase)

**jouclar** · 18/08/2008, 17h45

Merci quand même. Cela pourra toujours servir à quelqu'un

Code :

Sélectionner tout - Visualiser dans une fenêtre à part

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
###!/usr/bin/env python
# -*- coding: cp1252 -*-
 
import  xlrd
import  re
import  os, sys, os.path
import  csv, codecs
 
#
# Some portions based on a recipe by Bryan Niederberger from
# the ASPN Python cookbook, under the Python license:
#   http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/483742
# This code is released under the Python license as well.
# -- Philip (flip) Kromer flip@infochimp.org
#
 
#
# Read excel sheet into list of 2-d arrays
#
def xlrd_xls2array(infilename):
    """ Returns a list of sheets; each sheet is a dict containing
    * sheet_name: unicode string naming that sheet
    * sheet_data: 2-D table holding the converted cells of that sheet
    """
    book       = xlrd.open_workbook(infilename)
    sheets     = []
    formatter  = lambda(t,v): format_excelval(book,t,v,False)
 
    for sheet_name in book.sheet_names():
        raw_sheet = book.sheet_by_name(sheet_name)
        data      = []
        for row in range(raw_sheet.nrows):
            (types, values) = (raw_sheet.row_types(row), raw_sheet.row_values(row))
            data.append(map(formatter, zip(types, values)))
        sheets.append({ 'sheet_name': sheet_name, 'sheet_data': data })
    return sheets
 
##def tupledate_to_isodate(tupledate):
##    """
##    Turns a gregorian (year, month, day, hour, minute, nearest_second) into a
##    standard YYYY-MM-DDTHH:MM:SS ISO date.  If the date part is all zeros, it's
##    assumed to be a time; if the time part is all zeros it's assumed to be a date;
##    if all of it is zeros it's taken to be a time, specifically 00:00:00 (midnight).
##
##    Note that datetimes of midnight will come back as date-only strings.  A date
##    of month=0 and day=0 is meaningless, so that part of the coercion is safe.
##    For more on the hairy nature of Excel date/times see http://www.lexicon.net/sjmachin/xlrd.html
##    """
##    (y,m,d, hh,mm,ss) = tupledate
##    nonzero = lambda n: n!=0
##    date = "%04d-%02d-%02d"  % (y,m,d)    if filter(nonzero, (y,m,d))                else ''
##    time = "T%02d:%02d:%02d" % (hh,mm,ss) if filter(nonzero, (hh,mm,ss)) or not date else ''
##    return date+time
 
def format_excelval(book, type, value, wanttupledate):
    """ Clean up the incoming excel data """
    ##  Data Type Codes:
    ##  EMPTY   0
    ##  TEXT    1 a Unicode string
    ##  NUMBER  2 float
    ##  DATE    3 float
    ##  BOOLEAN 4 int; 1 means TRUE, 0 means FALSE
    ##  ERROR   5
    returnrow = []
    if   type == 2: # TEXT
        if value == int(value): value = int(value)
    elif type == 3: # NUMBER
        datetuple = xlrd.xldate_as_tuple(value, book.datemode)
        value = datetuple if wanttupledate else tupledate_to_isodate(datetuple)
    elif type == 5: # ERROR
        value = xlrd.error_text_from_code[value]
    return value
 
#
# Save to CSV
#
 
def camelize(s):
    """Makes a reasonable attempt at turning an arbitrary string
    into an identifier-safe CamelCasedString"""
    h = unicode(s)
    h = re.sub(r'(?:[_\s]+)([a-z])',
               lambda m: m.group(1).upper(), h)
    h = re.sub(r'[\-\.]+', '_', h)
    h = re.sub(r'\W',      '',  h)
    return h
 
def cp1252ize(l):
    """Make string-like things into utf-8, leave other things alone
    """
    return [unicode(s).encode("cp1252") if hasattr(s,'encode') else s for s in l]
 
 
def dump_csv(table, outdir, outfilename):
    stream = file(os.path.join(outdir, outfilename), 'wb')
    csvout = csv.writer(stream, delimiter=';', doublequote=False, escapechar='\\')
    csvout.writerows( map(cp1252ize, table) )
    stream.close()
 
def save_csv_tables(tables, outdir, outfilebase):
    for (sheet_idx, sheet) in enumerate(tables):
        outfilename = "%s_%d_%s.csv" % (outfilebase, sheet_idx, camelize(sheet['sheet_name']))
        dump_csv(sheet['sheet_data'], outdir, outfilename)
 
#
# Process files listed on command line, or all .xls files in current dir if no
# args given
#
re_excelfilename = re.compile(r'(\.xls)$')
try:    args = sys.argv[1:]
except: args = []
if len(args) < 1:
    infilenames = filter(re_excelfilename.search, os.listdir("."))
    infilenames.sort()
else:
    infilenames = args
 
for infilename in infilenames:
    tables = xlrd_xls2array(infilename)
    (outdir, infilebase) = os.path.split(infilename)
    outfilebase = re_excelfilename.sub('', infilebase)
    save_csv_tables(tables, outdir, outfilebase)

Exporter Excel en CSV

Python

Discussions similaires

Partager

Partager