Analyse XML - Problème de performance
Bonsoir.
J'utilise actuellement un script pour extraire des infos de mes playlists iTunes à partir du fichier XML propre à iTunes. Or ce fichier étant volumineux, d'une taille de 52,6 Mo, l'utilisation de mon script est très consommatrice de mémoire vive. Je souhaite donc améliorer ceci.
Voici la structure générale du fichier XML. Tout d'abord les chansons sont stockées ainsi.
Code:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
| <key>6642</key>
<dict>
<key>Track ID</key><integer>6642</integer>
<key>Name</key><string>...</string>
<key>Artist</key><string>...</string>
<key>Album Artist</key><string>...</string>
<key>Composer</key><string>...</string>
<key>Album</key><string>...</string>
<key>Genre</key><string>...</string>
<key>Kind</key><string>...</string>
<key>Size</key><integer>...</integer>
<key>Total Time</key><integer>...</integer>
<key>Track Number</key><integer>...</integer>
<key>Year</key><integer>...</integer>
<key>Date Modified</key><date>...</date>
<key>Date Added</key><date>...</date>
<key>Bit Rate</key><integer>...</integer>
<key>Sample Rate</key><integer>...</integer>
<key>Persistent ID</key><string>...</string>
<key>Track Type</key><string>...</string>
<key>Location</key><string>...</string>
<key>File Folder Count</key><integer>...</integer>
<key>Library Folder Count</key><integer>...</integer>
</dict> |
Les playlists sont stockées comme suit.
Code:
1 2 3 4 5 6 7 8 9 10 11 12 13 14
| <dict>
<key>Name</key><string>nameOfThPlaylist</string>
<key>Playlist ID</key><integer>87635</integer>
<key>Playlist Persistent ID</key><string>9A4C8A8C60D68000</string>
<key>Parent Persistent ID</key><string>60F56F7F0F05D2B3</string>
<key>All Items</key><true/>
<key>Playlist Items</key>
<array>
<dict>
<key>Track ID</key><integer>12010</integer>
</dict>
...
</array>
</dict> |
Globalement, le fichier XML ressemble à ceci.
Code:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
| <?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple Computer//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Major Version</key><integer>1</integer>
<key>Minor Version</key><integer>1</integer>
<key>Application Version</key><string>9.2</string>
<key>Features</key><integer>5</integer>
<key>Show Content Ratings</key><true/>
<key>Music Folder</key><string>...</string>
<key>Library Persistent ID</key><string>44A5D91826563D8D</string>
<key>Tracks</key>
<dict>
The definitions of the song (see before)
</dict>
<key>Playlists</key>
<array>
<dict>
<key>Name</key><string>...</string>
<key>Playlist ID</key><integer>...</integer>
<key>Playlist Persistent ID</key><string>AF0AD7E90AB4E728</string>
<key>Parent Persistent ID</key><string>B03891ABA2B5AB4C</string>
<key>All Items</key><true/>
<key>Folder</key><true/>
<key>Playlist Items</key>
<array>
<dict>
<key>Track ID</key><integer>...</integer>
</dict>
<dict>
<key>Track ID</key><integer>...</integer>
</dict>
</array>
</dict>
<dict>
The definitions of the playlists in the folder.
</dict>
</array>
</dict>
</plist> |
Mon code Python ressemble à l'horreur qui suit.
Code:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165
| #! /usr/bin/env python
import xml.etree.ElementTree as etree
import urllib.request
SONGS = {}
PLAYLISTS_ID_CONTAINER = {}
PLAYLISTS_FOLDER = {}
FOLDERS = {'': {}} # This is for playlists that are not in a folder.
ALIAS = {
'Track ID': "newSong",
'Name' : "songName",
'Artist' : "artistName",
'Composer': "composer",
'Album': "album",
'Genre': "style",
'Total Time': "time_ms",
'Location': "path"
}
def getSongs(elem):
lastKey = ''
newSong = False
text = elem.text or ""
for e in elem:
textInsideCurrentTag = getSongs(e).strip()
# A new song
if e.tag == 'key':
if textInsideCurrentTag in ALIAS:
lastKey = ALIAS[textInsideCurrentTag]
if e.tag == 'integer':
if lastKey == "newSong":
currentIndexSong = cleanSpecialCharacters(textInsideCurrentTag)
SONGS[currentIndexSong] = {}
lastKey = ""
elif lastKey in [ "time_ms" ]:
SONGS[currentIndexSong][lastKey] = cleanSpecialCharacters(textInsideCurrentTag)
lastKey = ""
elif e.tag == 'string' and lastKey == "path":
textInsideCurrentTag = cleanSpecialCharacters(textInsideCurrentTag)
if textInsideCurrentTag.startswith(r'file://localhost'):
textInsideCurrentTag = textInsideCurrentTag [len(r'file://localhost'):]
SONGS[currentIndexSong][lastKey] = cleanSpecialCharacters(textInsideCurrentTag)
lastKey = ""
elif e.tag == 'string' \
and lastKey in [
"songName",
"artistName" ,
"composer",
"album",
"style"
]:
SONGS[currentIndexSong][lastKey] = cleanSpecialCharacters(textInsideCurrentTag)
lastKey = ""
if e.tail:
text += e.tail
return text
def getPlaylist(elem):
global THE_PLAYLIST_IDS
global PLAYLIST_NAME
global WE_HAVE_FOLDER
global ID_CONTAINER
global ID_OBJECT
lastKey = ''
text = elem.text or ""
for e in elem:
textInsideCurrentTag = getPlaylist(e).strip()
# A new song
if e.tag == 'key':
if textInsideCurrentTag == 'Track ID':
lastKey = ALIAS[textInsideCurrentTag]
elif textInsideCurrentTag == 'Folder':
WE_HAVE_FOLDER = True
elif textInsideCurrentTag == 'Name':
lastKey = 'playlistName'
elif textInsideCurrentTag == 'Parent Persistent ID':
lastKey = 'inFolder'
elif textInsideCurrentTag == 'Playlist Persistent ID':
lastKey = 'identity'
elif e.tag == 'integer' and lastKey == "newSong":
THE_PLAYLIST_IDS.append(textInsideCurrentTag)
lastKey = ""
elif e.tag == 'string':
if PLAYLIST_NAME == '' and lastKey == 'playlistName':
PLAYLIST_NAME = e.text.strip()
lastKey = ""
elif lastKey == 'inFolder':
ID_CONTAINER = e.text.strip()
lastKey = ""
elif lastKey == 'identity':
ID_OBJECT = e.text.strip()
lastKey = ""
if e.tail:
text += e.tail
return text
def printFolder(idObject):
global FOLDERS
if 'container' in FOLDERS[idObject]:
if FOLDERS[idObject]['container']:
return printFolder(FOLDERS[idObject]['container']) + '/' + FOLDERS[idObject]['name']
return FOLDERS[idObject]['name']
return ''
# Here we go...
print("",
"===== READING iTunes Library INFOS =====",
"1) Building the tree of the XML iTunes file. This can take a long time...",
sep = '\n')
tree = etree.parse(pathOfThePlaylist)
print("2) Looking for all the songs. This can take a long time...")
getSongs(tree.getroot().find('dict/dict'))
print("3) Looking for all the playlists...")
folders = []
for onePlaylist in tree.getroot().findall('dict/array/dict'):
THE_PLAYLIST_IDS = []
PLAYLIST_NAME = ''
WE_HAVE_FOLDER = False
ID_CONTAINER = ''
ID_OBJECT = ''
getPlaylist(onePlaylist)
if WE_HAVE_FOLDER:
FOLDERS[ID_OBJECT] = { 'name': PLAYLIST_NAME,
'container': ID_CONTAINER
}
else:
if ID_CONTAINER in PLAYLISTS_ID_CONTAINER:
PLAYLISTS_ID_CONTAINER[ID_CONTAINER][PLAYLIST_NAME]= THE_PLAYLIST_IDS
else:
PLAYLISTS_ID_CONTAINER[ID_CONTAINER] = {PLAYLIST_NAME: THE_PLAYLIST_IDS}
for oneIdContainer in PLAYLISTS_ID_CONTAINER:
PLAYLISTS_FOLDER[printFolder(oneIdContainer)] = PLAYLISTS_ID_CONTAINER[oneIdContainer]
print("4) Reading iTunes Library infos is finished.") |
Dans le code ci-dessus, c'est l'utilisation de etree.parse(pathOfThePlaylist) qui pose problème. Comment éviter son emploi ?