I could give the final script right there it's less than 30 lines of code and pretty clear : you move file from the origin if there is nothing in destination formed with the name of the artist
#!/usr/bin/env python
from mutagen.mp3 import MP3
from mutagen.flac import FLAC
from mutagen.easyid3 import EasyID3
from pathlib import Path
import shutil
root=Path(Path.home() / "Musique" /"Music" / "incoming")
dest_root_dir=Path(Path.home() / "Musique")
import os
for r,d, f in os.walk(root):
for file in f:
try:
reader = lambda fl: MP3(x, ID3=EasyID3)
if file.endswith("flac") or file.endswith("FLAC"):
reader=FLAC
mp3= reader(os.path.join(r,file))
fn= os.path.join(r , file)
artist=mp3.get('artist', ["tosort"])[0].strip()
album= mp3.get('album', ["tosort"])[0].strip()
dst_dir = Path(dest_root_dir / artist / album )
destination=Path(dst_dir / file)
print(f"{fn}->{destination}")
try:
os.makedirs(dst_dir)
except:
pass
if not destination.is_file():
shutil.move( fn, destination )
except Exception as e:
print("Arg %s for file «%s»" % (repr(e), file))
But, what if you had more than one choice for solving the doublons ? One being good and the other one corrupted or worse ... truncated ?
For the fun, we are gonna try to see how bad the situation is before moving files and if it's worhty. First out of my small playlist of 5300 do I have the ID3 tags album and artist defined for everyone ?
#!/usr/bin/env python
from mutagen.mp3 import MP3
from mutagen.easyid3 import EasyID3
from archery import mdict
import os
SEEN=mdict()
for r,d, f in os.walk("/home/jul/Musique"):
for file in f:
try:
SEEN += mdict({ k: 1 for k in MP3(os.path.join(r,file), ID3=EasyID3).keys() })
except Exception as e:
print("Arg %s for file «%s»" % (repr(e), file))
from json import dumps
print(dumps(dict(reversed(sorted(SEEN.items(), key=lambda item: item[1]))), indent=4))
The answer is ...
{
"title": 3891,
"artist": 3879,
"genre": 3394,
"album": 3248,
"tracknumber": 2830,
"date": 2572,
...
}
20% of the files don't seem to have any MP3 tags at all (metadata). My mp3 have been gathered since 1996 so it's quite normal :D
Second, how much collision do we have ? Either by hash or by filename ?
It's a tad overkill, but since I have postgres running, let's put all the data in base :
DROP TABLE IF EXISTS public.mp3;
CREATE TABLE IF NOT EXISTS public.mp3
(
filename text COLLATE pg_catalog."default" NOT NULL,
artist text COLLATE pg_catalog."default",
album text COLLATE pg_catalog."default",
title text COLLATE pg_catalog."default",
date text COLLATE pg_catalog."default",
tracknumber text COLLATE pg_catalog."default",
destination text COLLATE pg_catalog."default",
hash text COLLATE pg_catalog."default",
CONSTRAINT mail_pkey PRIMARY KEY (filename)
)
TABLESPACE pg_default;
ALTER TABLE IF EXISTS public.mp3
OWNER to jul;
And then we insert all data in the database
#!/usr/bin/env python
from mutagen.mp3 import MP3
from mutagen.easyid3 import EasyID3
from pathlib import Path
import psycopg2
import os
conn = psycopg2.connect("dbname=mp3 user=jul")
root=Path(Path.home() / "Musique")
for r,d, f in os.walk(root):
for file in f:
try:
if file.lower().endswith("mp3"):
mp3= MP3(os.path.join(r,file), ID3=EasyID3)
with conn.cursor() as sql:
fn= os.path.join(r , file)
artist=mp3.get('artist', ["unk"])[0].strip()
album= mp3.get('album', ["unk"])[0].strip()
title=mp3.get('title', ["unk"])[0].strip()
destination=Path(root / artist / album / file)
#from pdb import set_trace;set_trace()
sql.execute("""
INSERT INTO mp3
(filename, artist, album, title, tracknumber, "date", destination, hash
)
VALUES ( %s, %s, %s, %s, %s, %s, %s, %s);
""",(
fn,
artist,
album,
title,
mp3.get('tracknumber', ["unk"])[0].strip(),
mp3.get('date', ["unk"])[0].strip(),
str(destination),
hash(open(fn,"rb").read()),
))
conn.commit()
except Exception as e:
print("Arg %s for file «%s»" % (repr(e), file))
The python hash function beats cryptographic hash in speed and is so used it bulletproof.
For detecting collision, it is « good enough ».
Now, let's see if it's worthy to clean up my mess ?
select count(*) from mp3 m
INNER JOIN
(select count(*) as ord, hash from mp3 GROUP BY hash ) sec
ON m.hash=sec.hash
WHERE sec.ord > 1 ;
count | 358
Nearly 10% ! That's a good catch.
Actually we count some files more than once. Let's try to see how much files we expect in the end.First, let's count files unique by hash :
select count(*) from mp3 m
inner join
(select count(*) as ord, hash from mp3 GROUP BY hash HAVING count(hash) = 1 ) sec
on m.hash = sec.hash;
count
-------
5101
Let's count unique expected destination for multiple hash
select count(distinct(m.destination)) from mp3 m
inner join
(select count(*) as ord, hash from mp3 GROUP BY hash HAVING count(hash) != 1 ) sec
on m.hash = sec.hash;
count
-------
185
So .... We expect 5286 mp3 files in the destination directory :
find . -type f | grep -i "mp3$"| wc -l 5281Well, it accounts for the OS errors due to me having not checked if the mp3 files where actually not misnamed format.
The world being not about perfection but pragmatism, I call it a win for half a day of having fun and cleaning up :D
No comments:
Post a Comment