diff options
Diffstat (limited to 'raphodo/scan.py')
-rwxr-xr-x | raphodo/scan.py | 1241 |
1 files changed, 1241 insertions, 0 deletions
diff --git a/raphodo/scan.py b/raphodo/scan.py new file mode 100755 index 0000000..1a6de86 --- /dev/null +++ b/raphodo/scan.py @@ -0,0 +1,1241 @@ +#!/usr/bin/env python3 + +# Copyright (C) 2011-2017 Damon Lynch <damonlynch@gmail.com> + +# This file is part of Rapid Photo Downloader. +# +# Rapid Photo Downloader is free software: you can redistribute it and/or +# modify it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Rapid Photo Downloader is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Rapid Photo Downloader. If not, +# see <http://www.gnu.org/licenses/>. + +""" +Scans a directory looking for photos and videos, and any associated files +external to the actual photo/video including thumbnail files, XMP files, and +audio files that are linked to a photo. + +Returns results using the 0mq pipeline pattern. + +Photo and movie metadata is (for the most part) not read during this +scan process, because doing so is too slow. However, as part of scanning a +device, there are two aspects to metadata that are in fact needed: + +1. A sample of photo and video metadata, that is used to demonstrate file + renaming. That is one sample photo, and one sample video. + +2. The device's time zone must be determined, as cameras handle their time + zone setting differently from phones, and results can be unpredictable. + Therefore we need to analyze the created date time metadata of a file on the + device and compare it against the file modification time on the file system + or more importantly, gphoto2. 
It's not an exact science and there are + problems, but doing this is better than not doing it at all. + +""" + +__author__ = 'Damon Lynch' +__copyright__ = "Copyright 2011-2017, Damon Lynch" + +import os +import sys +import pickle +import logging +from collections import (namedtuple, defaultdict, deque) +from datetime import datetime +import tempfile +import operator + +if sys.version_info < (3,5): + import scandir + walk = scandir.walk +else: + walk = os.walk +from typing import List, Dict, Union, Optional, Iterator, Tuple, DefaultDict + +import gphoto2 as gp + +# Instances of classes ScanArguments and ScanPreferences are passed via pickle +# Thus do not remove these two imports +from raphodo.interprocess import ScanArguments +from raphodo.preferences import ScanPreferences, Preferences +from raphodo.interprocess import (WorkerInPublishPullPipeline, ScanResults, + ScanArguments) +from raphodo.camera import Camera, CameraError, CameraProblemEx +import raphodo.rpdfile as rpdfile +from raphodo.constants import (DeviceType, FileType, DeviceTimestampTZ, CameraErrorCode, + FileExtension, ThumbnailCacheDiskStatus, all_tags_offset, ExifSource) +from raphodo.rpdsql import DownloadedSQL, FileDownloaded +from raphodo.cache import ThumbnailCacheSql +from raphodo.utilities import (stdchannel_redirected, datetime_roughly_equal, + GenerateRandomFileName, format_size_for_user) +from raphodo.exiftool import ExifTool +import raphodo.metadatavideo as metadatavideo +import raphodo.metadataphoto as metadataphoto +from raphodo.problemnotification import ( + ScanProblems, UnhandledFileProblem, CameraDirectoryReadProblem, CameraFileInfoProblem, + CameraFileReadProblem, FileMetadataLoadProblem, FileWriteProblem, FsMetadataReadProblem, + FileZeroLengthProblem +) +from raphodo.storage import get_uri, CameraDetails + +FileInfo = namedtuple('FileInfo', 'path modification_time size ext_lower base_name file_type') +CameraFile = namedtuple('CameraFile', 'name size') +CameraMetadataDetails = 
namedtuple('CameraMetadataDetails', + 'path name size extension mtime file_type') +SampleMetadata = namedtuple('SampleMetadata', 'datetime determined_by') + + +class ScanWorker(WorkerInPublishPullPipeline): + + def __init__(self): + self.downloaded = DownloadedSQL() + self.thumbnail_cache = ThumbnailCacheSql() + self.no_previously_downloaded = 0 + self.file_batch = [] + self.batch_size = 50 + self.file_type_counter = rpdfile.FileTypeCounter() + self.file_size_sum = rpdfile.FileSizeSum() + self.device_timestamp_type = DeviceTimestampTZ.undetermined + + # full_file_name (path+name):timestamp + self.file_mdatatime = {} # type: Dict[str, float] + + self.sample_exif_bytes = None # type: bytes + self.sample_exif_source = None # type: ExifSource + self.sample_photo = None # type: rpdfile.Photo + self.sample_video = None # type: rpdfile.Video + self.sample_video_extract_full_file_name = None # type: Optional[str] + self.sample_photo_file_full_file_name = None # type: Optional[str] + self.sample_video_file_full_file_name = None # type: Optional[str] + self.sample_video_full_file_downloaded = None # type: Optional[bool] + self.found_sample_photo = False + self.found_sample_video = False + + self.prefs = Preferences() + self.scan_preferences = ScanPreferences(self.prefs.ignored_paths) + + self.problems = ScanProblems() + + self._camera_details = None # type: Optional[CameraDetails] + + super().__init__('Scan') + + def do_work(self) -> None: + try: + self.do_scan() + except Exception as e: + try: + device = self.display_name + except AttributeError: + device = '' + logging.exception("Unexpected exception while scanning %s", device) + + self.content = pickle.dumps( + ScanResults(scan_id=int(self.worker_id), fatal_error=True), + pickle.HIGHEST_PROTOCOL + ) + self.send_message_to_sink() + self.disconnect_logging() + self.send_finished_command() + + def do_scan(self) -> None: + logging.debug("Scan {} worker started".format(self.worker_id.decode())) + + scan_arguments = 
pickle.loads(self.content) # type: ScanArguments + if scan_arguments.log_gphoto2: + gp.use_python_logging() + + if scan_arguments.ignore_other_types: + rpdfile.PHOTO_EXTENSIONS_SCAN = rpdfile.PHOTO_EXTENSIONS_WITHOUT_OTHER + + self.device = scan_arguments.device + + self.download_from_camera = scan_arguments.device.device_type == DeviceType.camera + self.camera_storage_descriptions = [] + if self.download_from_camera: + self.camera_model = scan_arguments.device.camera_model + self.camera_port = scan_arguments.device.camera_port + self.is_mtp_device = scan_arguments.device.is_mtp_device + self.camera_display_name = scan_arguments.device.display_name + self.display_name = self.camera_display_name + self.ignore_mdatatime_for_mtp_dng = self.is_mtp_device and \ + self.prefs.ignore_mdatatime_for_mtp_dng + else: + self.camera_port = self.camera_model = self.is_mtp_device = None + self.ignore_mdatatime_for_mtp_dng = False + self.camera_display_name = None + + self.files_scanned = 0 + self.camera = None + + if not self.download_from_camera: + # Download from file system + path = os.path.abspath(scan_arguments.device.path) + if not self.prefs.device_without_dcim_autodetection and \ + scan_arguments.device.device_type == DeviceType.volume: + path = os.path.join(path, "DCIM") + self.display_name = scan_arguments.device.display_name + # Scan the files using lightweight high-performance scandir + logging.info("Scanning {}".format(self.display_name)) + + self.problems.uri = get_uri(path=path) + self.problems.name = self.display_name + + # Before doing anything else, determine time zone approach + # Need two different walks because first folder of files + # might be videos, then the 2nd folder photos, etc. 
+ self.distinguish_non_camera_device_timestamp(path) + + if self.scan_preferences.scan_this_path(path): + for dir_name, name in self.walk_file_system(path): + self.dir_name = dir_name + self.file_name = name + self.process_file() + + else: + # scanning directly from camera + have_optimal_display_name = scan_arguments.device.have_optimal_display_name + while True: + try: + self.camera = Camera(model=scan_arguments.device.camera_model, + port=scan_arguments.device.camera_port, + raise_errors=True) + if not have_optimal_display_name: + # Update the GUI with the real name of the camera + # and its storage information + have_optimal_display_name = True + self.camera_display_name = self.camera.display_name + self.display_name = self.camera_display_name + storage_space = self.camera.get_storage_media_capacity(refresh=True) + storage_descriptions = self.camera.get_storage_descriptions() + self.content = pickle.dumps( + ScanResults( + optimal_display_name=self.camera_display_name, + storage_space=storage_space, + storage_descriptions=storage_descriptions, + scan_id=int(self.worker_id), + ), + pickle.HIGHEST_PROTOCOL + ) + self.send_message_to_sink() + break + except CameraProblemEx as e: + self.content = pickle.dumps(ScanResults( + error_code=e.code, + scan_id=int(self.worker_id)), + pickle.HIGHEST_PROTOCOL) + self.send_message_to_sink() + # Wait for command to resume or halt processing + self.resume_work() + + if self.download_from_camera: + self.camera_details = 0 + self.problems.uri = get_uri(camera_details=self.camera_details) + self.problems.name = self.display_name + + if self.ignore_mdatatime_for_mtp_dng: + logging.info("For any DNG files on the %s, when determining the creation date/" + "time, the metadata date/time will be ignored, and the file " + "modification date/time used instead", self.display_name) + + # Download only from the DCIM folder(s) in the camera. 
+ # Phones especially have many directories with images, which we + # must ignore + if self.camera.camera_has_dcim(): + logging.info("Scanning {}".format(self.display_name)) + self._camera_folders_and_files = [] + self._camera_file_names = defaultdict(list) + self._camera_audio_files = defaultdict(list) + self._camera_video_thumbnails = defaultdict(list) + self._camera_xmp_files = defaultdict(list) + self._folder_identifiers = {} + self._folder_identifers_for_file = \ + defaultdict(list) # type: DefaultDict[int, List[int]] + self._camera_directories_for_file = defaultdict(list) + self._camera_photos_videos_by_type = \ + defaultdict(list) # type: DefaultDict[FileExtension, List[CameraMetadataDetails]] + + dcim_folders = self.camera.dcim_folders + + if len(dcim_folders) > 1: + # This camera has dual memory cards. + # Give each folder an numeric identifier that will be + # used to identify which card a given file comes from + dcim_folders.sort() + for idx, folder in enumerate(dcim_folders): + self._folder_identifiers[folder] = idx + 1 + + # locate photos and videos, identifying duplicate files + # identify candidates for extracting metadata + for idx, dcim_folder in enumerate(dcim_folders): + # Setup camera details for each storage space in the camera + self.camera_details = idx + # Now initialize the problems container, if not already done so + if idx: + self.problems.name = self.camera_display_name + self.problems.uri = get_uri(camera_details=self.camera_details) + + logging.debug("Scanning %s on %s", dcim_folder, self.camera.display_name) + folder_identifier = self._folder_identifiers.get(dcim_folder) + basedir = dcim_folder[:-len('/DCIM')] + self.locate_files_on_camera(dcim_folder, folder_identifier, basedir) + + # extract non camera metadata + if self._camera_photos_videos_by_type: + self.identify_camera_tz_and_sample_files() + + # now, process each file + for self.dir_name, self.file_name in self._camera_folders_and_files: + self.process_file() + else: + 
logging.warning("Unable to detect any DCIM folders on %s", self.display_name) + + self.camera.free_camera() + + if self.file_batch: + # Send any remaining files, including the sample photo or video + self.content = pickle.dumps( + ScanResults( + self.file_batch, + self.file_type_counter, + self.file_size_sum, + sample_photo=self.sample_photo, + sample_video=self.sample_video + ), + pickle.HIGHEST_PROTOCOL + ) + self.send_message_to_sink() + + self.send_problems() + + if self.files_scanned > 0 and not (self.files_scanned == 0 and self.download_from_camera): + logging.info("{} total files scanned on {}".format(self.files_scanned, + self.display_name)) + + self.disconnect_logging() + self.send_finished_command() + + def send_problems(self) -> None: + if self.problems: + self.content = pickle.dumps( + ScanResults( + scan_id=int(self.worker_id), problems=self.problems + ), + pickle.HIGHEST_PROTOCOL + ) + self.send_message_to_sink() + + def walk_file_system(self, path_to_walk: str) -> Iterator[Tuple[str, str]]: + """ + Return files on local file system, ignoring those in directories + the user doesn't want scanned + :param path_to_walk: the path to scan + """ + + for dir_name, dir_list, file_list in walk(path_to_walk): + if len(dir_list) > 0: + if self.scan_preferences.ignored_paths: + # Don't inspect paths the user wants ignored + # Altering subdirs in place controls the looping + # [:] ensures the list is altered in place + # (mutating slice method) + dir_list[:] = filter(self.scan_preferences.scan_this_path, dir_list) + for name in file_list: + yield dir_name, name + + def locate_files_on_camera(self, path: str, folder_identifier: int, basedir: str) -> None: + """ + Scans the memory card(s) on the camera for photos, videos, + audio files, and video thumbnail (THM) files. Looks only in the + camera's DCIM folders, which are assumed to have already been + located. 
+ + We cannot assume file names are unique on any one memory card, + as although it's unlikely, it's possible that a file with + the same name might be in different subfolders. + + For cameras with two memory cards, there are two broad + possibilities: + + (1) the cards' contents mirror each other, because the camera + writes the same files to both cards simultaneously + + (2) each card has a different set of files, e.g. because a + different file type is written to each card, or the 2nd card is + used only when the first is full + + In practice, we have to assume that if there are two memory + cards, some files will be identical, and others different. Thus + we have to scan the contents of both cards, analyzing file + names, file modification times and file sizes. + + If a camera has more than one memory card, we store which + card the file came from using a simple numeric identifier i.e. + 1 or 2. + + For duplicate files, we record both directories the file is + stored on. + + :param path: the path on the camera to analyze for files and + folders + :param folder_identifier: if not None, then indicates (1) the + camera being scanned has more than one memory card, and (2) + the simple numeric identifier of the memory card being + scanned right now + :param basedir: the base directory of the path, as reported by + libgphoto2 + """ + + files_in_folder = [] + names = [] + try: + files_in_folder = self.camera.camera.folder_list_files(path, self.camera.context) + except gp.GPhoto2Error as e: + logging.error("Unable to scan files on camera: error %s", e.code) + uri = get_uri(path=path, camera_details=self.camera_details) + self.problems.append(CameraDirectoryReadProblem(uri=uri, name=path, gp_code=e.code)) + + if files_in_folder: + # Distinguish the file type for every file in the folder + names = [name for name, value in files_in_folder] + split_names = [os.path.splitext(name) for name in names] + # Remove the period from the extension + exts = [ext[1:] for name, ext in 
split_names] + exts_lower = [ext.lower() for ext in exts] + ext_types = [rpdfile.extension_type(ext) for ext in exts_lower] + + for idx, name in enumerate(names): + # Check to see if the process has received a command to terminate + # or pause + self.check_for_controller_directive() + + # Get the information we extracted above + base_name = split_names[idx][0] + ext = exts[idx] + ext_lower = exts_lower[idx] + ext_type = ext_types[idx] + file_type = rpdfile.file_type(ext_lower) + + if file_type is not None: + # file is a photo or video + file_is_unique = True + try: + modification_time, size = self.camera.get_file_info(path, name) + except gp.GPhoto2Error as e: + logging.error( + "Unable to access modification_time or size from %s on %s. Error code: %s", + os.path.join(path, name), self.display_name, e.code + ) + modification_time, size = 0, 0 + uri = get_uri( + full_file_name=os.path.join(path, name), camera_details=self.camera_details + ) + self.problems.append(CameraFileInfoProblem(uri=uri, gp_code=e.code)) + else: + if size <= 0: + full_file_name = os.path.join(path, name) + logging.error( + "Zero length file %s will not be downloaded from %s", + full_file_name, self.display_name + ) + uri = get_uri( + full_file_name=full_file_name, camera_details=self.camera_details + ) + self.problems.append(FileZeroLengthProblem(name=name, uri=uri)) + + if size > 0: + key = rpdfile.make_key(file_type, basedir) + self.file_type_counter[key] += 1 + self.file_size_sum[key] += size + + # Store the directory this file is stored in, used when + # determining if associate files are part of the download + cf = CameraFile(name=name, size=size) + self._camera_directories_for_file[cf].append(path) + + if folder_identifier is not None: + # Store which which card the file came from using a + # simple numeric identifier i.e. 1 or 2. 
+ self._folder_identifers_for_file[cf].append(folder_identifier) + + if name in self._camera_file_names: + for existing_file_info in self._camera_file_names[name]: + # Don't compare file modification time in this + # comparison, because files can be written to + # different cards several seconds apart when + # the write speeds of the cards differ + if existing_file_info.size == size: + file_is_unique = False + break + if file_is_unique: + file_info = FileInfo( + path=path, modification_time=modification_time, + size=size, file_type=file_type, base_name=base_name, + ext_lower=ext_lower + ) + metadata_details = CameraMetadataDetails( + path=path, name=name, size=size, extension=ext_lower, + mtime=modification_time, file_type=file_type + ) + self._camera_file_names[name].append(file_info) + self._camera_folders_and_files.append([path, name]) + self._camera_photos_videos_by_type[ext_type].append(metadata_details) + else: + # this file on the camera is not a photo or video + if ext_lower in rpdfile.AUDIO_EXTENSIONS: + self._camera_audio_files[base_name].append((path, ext)) + elif ext_lower in rpdfile.VIDEO_THUMBNAIL_EXTENSIONS: + self._camera_video_thumbnails[base_name].append((path, ext)) + elif ext_lower == 'xmp': + self._camera_xmp_files[base_name].append((path, ext)) + else: + logging.info("Ignoring unknown file %s on %s", + os.path.join(path, name), self.display_name) + if self.prefs.warn_about_unknown_file(ext=ext): + uri = get_uri( + full_file_name=os.path.join(path, name), + camera_details=self.camera_details + ) + self.problems.append(UnhandledFileProblem(name=name, uri=uri)) + folders = [] + try: + for name, value in self.camera.camera.folder_list_folders(path, self.camera.context): + if self.scan_preferences.scan_this_path(os.path.join(path, name)): + folders.append(name) + except gp.GPhoto2Error as e: + logging.error("Unable to scan files on %s. 
Error code: %s", self.display_name, e.code) + uri = get_uri(path=path, camera_details=self.camera_details) + self.problems.append(CameraDirectoryReadProblem(uri=uri, name=path, gp_code=e.code)) + + # recurse over subfolders + for name in folders: + self.locate_files_on_camera(os.path.join(path, name), folder_identifier, basedir) + + def identify_camera_tz_and_sample_files(self) -> None: + """ + Get sample metadata for photos and videos, and determine device timezone setting. + """ + + # do in place sort of jpegs, RAWs and videos by file size + for files in self._camera_photos_videos_by_type.values(): + files.sort(key=operator.attrgetter('size')) + + # When determining how a camera reports modification time, extraction order + # of preference is (1) jpeg, (2) RAW, and finally least preferred is (3) video + # However, if ignore_mdatatime_for_mtp_dng is set, ignore the RAW files + + if not self.ignore_mdatatime_for_mtp_dng: + order = (FileExtension.jpeg, FileExtension.raw, FileExtension.video) + else: + order = (FileExtension.jpeg, FileExtension.video, FileExtension.raw) + + have_photos = len(self._camera_photos_videos_by_type[FileExtension.raw]) > 0 or \ + len(self._camera_photos_videos_by_type[FileExtension.jpeg]) > 0 + have_videos = len(self._camera_photos_videos_by_type[FileExtension.video]) > 0 + + max_attempts = 5 + for ext_type in order: + for file in self._camera_photos_videos_by_type[ext_type][:max_attempts]: \ + # type: CameraMetadataDetails + get_tz = self.device_timestamp_type == DeviceTimestampTZ.undetermined and not ( + self.ignore_mdatatime_for_mtp_dng and ext_type == FileExtension.raw) + get_sample_metadata = ( + file.file_type == FileType.photo and self.sample_exif_source is None) or ( + file.file_type == FileType.video and + self.sample_video_extract_full_file_name is None) + + if get_tz or get_sample_metadata: + logging.info("Extracting sample %s metadata for %s", + file.file_type.name, self.camera_display_name) + sample = 
self.sample_camera_metadata( + path=file.path, name=file.name, ext_type=ext_type, extension=file.extension, + modification_time=file.mtime, size=file.size) + if get_tz: + self.determine_device_timestamp_tz(sample.datetime, file.mtime, + sample.determined_by) + need_sample_photo = self.sample_exif_source is None and have_photos + need_sample_video = self.sample_video_extract_full_file_name is None and \ + have_videos + if not (need_sample_photo or need_sample_video): + break + + def process_file(self) -> None: + # Check to see if the process has received a command to terminate or + # pause + self.check_for_controller_directive() + + file = os.path.join(self.dir_name, self.file_name) + + # do we have permission to read the file? + if self.download_from_camera or os.access(file, os.R_OK): + + # count how many files of each type are included + # i.e. how many photos and videos + self.files_scanned += 1 + if not self.files_scanned % 10000: + logging.info("Scanned {} files".format( + self.files_scanned)) + + if not self.download_from_camera: + base_name, ext = os.path.splitext(self.file_name) + ext = ext.lower()[1:] + file_type = rpdfile.file_type(ext) + + # For next code block, see comment in + # self.distinguish_non_camera_device_timestamp() + # This only applies to files being scanned on the file system, not + # cameras / phones. 
+ if file_type == FileType.photo and self.sample_exif_source is None: + # this should never happen due to photos being prioritized over videos + # with respect to time zone determination + logging.error("Sample metadata not extracted from photo %s although it should " + "have been used to determine the device timezone", self.file_name) + elif file_type == FileType.video and self.sample_video_file_full_file_name is None: + self.sample_non_camera_metadata(self.dir_name, self.file_name, file, + FileExtension.video) + else: + base_name = None + for file_info in self._camera_file_names[self.file_name]: + if file_info.path == self.dir_name: + base_name = file_info.base_name + ext = file_info.ext_lower + file_type = file_info.file_type + break + assert base_name is not None + + if file_type is not None: + self.file_type_counter[file_type] += 1 + + if self.download_from_camera: + modification_time = file_info.modification_time + # zero length files have already been filtered out + size = file_info.size + camera_file = CameraFile(name=self.file_name, size=size) + else: + stat = os.stat(file) + size = stat.st_size + if size <= 0: + logging.error( + "Zero length file %s will not be downloaded from %s", + file, self.display_name + ) + uri = get_uri(full_file_name=file) + self.problems.append(FileZeroLengthProblem(name=self.file_name, uri=uri)) + return + modification_time = stat.st_mtime + camera_file = None + + self.file_size_sum[file_type] += size + + # look for thumbnail file (extension THM) for videos + if file_type == FileType.video: + thm_full_name = self.get_video_THM_file(base_name, camera_file) + else: + thm_full_name = None + + # check if an XMP file is associated with the photo or video + xmp_file_full_name = self.get_xmp_file(base_name, camera_file) + + # check if an audio file is associated with the photo or video + audio_file_full_name = self.get_audio_file(base_name, camera_file) + + # has the file been downloaded previously? 
+ # note: we should use the adjusted mtime, not the raw one + adjusted_mtime = self.adjusted_mtime(modification_time) + + downloaded = self.downloaded.file_downloaded( + name=self.file_name, + size=size, + modification_time=adjusted_mtime) + + thumbnail_cache_status = ThumbnailCacheDiskStatus.unknown + + # Assign metadata time, if we have it + # If we don't, it will be extracted when thumbnails are generated + mdatatime = self.file_mdatatime.get(file, 0.0) + + ignore_mdatatime = self.ignore_mdatatime(ext=ext) + + if not mdatatime and self.prefs.use_thumbnail_cache and not ignore_mdatatime: + # Was there a thumbnail generated for the file? + # If so, get the metadata date time from that + get_thumbnail = self.thumbnail_cache.get_thumbnail_path( + full_file_name=file, mtime=adjusted_mtime, + size=size, camera_model=self.camera_model + ) + thumbnail_cache_status = get_thumbnail.disk_status + if thumbnail_cache_status in ( + ThumbnailCacheDiskStatus.found, ThumbnailCacheDiskStatus.failure): + mdatatime = get_thumbnail.mdatatime + + if downloaded is not None: + self.no_previously_downloaded += 1 + prev_full_name = downloaded.download_name + prev_datetime = downloaded.download_datetime + else: + prev_full_name = prev_datetime = None + + if self.download_from_camera: + camera_memory_card_identifiers = self._folder_identifers_for_file[camera_file] + if not camera_memory_card_identifiers: + camera_memory_card_identifiers = None + else: + camera_memory_card_identifiers = None + + problem=None + + rpd_file = rpdfile.get_rpdfile( + name=self.file_name, + path=self.dir_name, + size=size, + prev_full_name=prev_full_name, + prev_datetime=prev_datetime, + device_timestamp_type=self.device_timestamp_type, + mtime=modification_time, + mdatatime=mdatatime, + thumbnail_cache_status=thumbnail_cache_status, + thm_full_name=thm_full_name, + audio_file_full_name=audio_file_full_name, + xmp_file_full_name=xmp_file_full_name, + scan_id=self.worker_id, + file_type=file_type, + 
from_camera=self.download_from_camera, + camera_details=self.camera_details, + camera_memory_card_identifiers=camera_memory_card_identifiers, + never_read_mdatatime=ignore_mdatatime, + device_display_name=self.display_name, + device_uri=self.device.uri, + raw_exif_bytes=None, + exif_source=None, + problem=problem + ) + + self.file_batch.append(rpd_file) + + if not self.found_sample_photo and file == self.sample_photo_file_full_file_name: + self.sample_photo = self.create_sample_rpdfile(name=self.file_name, + path=self.dir_name, + size=size, + mdatatime=mdatatime, + file_type=FileType.photo, + mtime=modification_time, + ignore_mdatatime=ignore_mdatatime) + self.sample_exif_bytes = None + self.found_sample_photo = True + + if not self.found_sample_video and file == self.sample_video_file_full_file_name: + self.sample_video = self.create_sample_rpdfile(name=self.file_name, + path=self.dir_name, + size=size, + mdatatime=mdatatime, + file_type=FileType.video, + mtime=modification_time, + ignore_mdatatime=ignore_mdatatime) + if self.sample_video_full_file_downloaded: + rpd_file.cache_full_file_name = self.sample_video_extract_full_file_name + self.sample_video_extract_full_file_name = None + self.found_sample_video = True + + if len(self.file_batch) == self.batch_size: + self.content = pickle.dumps(ScanResults( + rpd_files=self.file_batch, + file_type_counter=self.file_type_counter, + file_size_sum=self.file_size_sum, + sample_photo=self.sample_photo, + sample_video=self.sample_video), + pickle.HIGHEST_PROTOCOL) + self.send_message_to_sink() + self.file_batch = [] + self.sample_photo = None + self.sample_video = None + + def send_message_to_sink(self) -> None: + try: + logging.debug( + "Sending %s scanned files from %s to sink", len(self.file_batch), self.display_name + ) + except AttributeError: + pass + super().send_message_to_sink() + + def ignore_mdatatime(self, ext: str) -> bool: + return self.ignore_mdatatime_for_mtp_dng and ext == 'dng' + + def 
def sample_camera_metadata(self, path: str,
                           name: str,
                           extension: str,
                           ext_type: FileExtension,
                           size: int,
                           modification_time: int) -> SampleMetadata:
    """
    Extract sample metadata, including specifically datetime, from a photo
    or video on a camera.

    One of three strategies is used, depending on the file type:

    1. jpeg on a camera that can fetch thumbnails: the exif extract
       returned by the camera is loaded as an APP1 segment.
    2. RAW, or jpeg on a camera that cannot fetch thumbnails: the leading
       part of the file is fetched from the camera and loaded as raw bytes.
    3. video: a chunk of the file is saved to a temporary file and read with
       ExifTool. Video files are special in that sometimes the entire file
       has to be read in order to extract its metadata, so if the first
       chunk is not enough, the whole file is saved and read instead.

    Failures are not raised; they are recorded in self.problems. When the
    date time is found it is stored in self.file_mdatatime, keyed by the
    file's full name on the camera.

    :param path: directory on the camera containing the file
    :param name: the file's name
    :param extension: file extension, used to look up how much of the file
     must be read (all_tags_offset)
    :param ext_type: whether the file is a jpeg, RAW or video
    :param size: size of the file
    :param modification_time: the file's modification time, which is
     adjusted and passed on when saving a video chunk
    :return: the metadata date time (None if it could not be extracted),
     and a short string naming the kind of file it was determined by
    """

    dt = determined_by = None
    use_app1 = save_chunk = exif_extract = False
    if ext_type == FileExtension.jpeg:
        determined_by = 'jpeg'
        if self.camera.can_fetch_thumbnails:
            use_app1 = True
        else:
            exif_extract = True
    elif ext_type == FileExtension.raw:
        determined_by = 'RAW'
        exif_extract = True
    elif ext_type == FileExtension.video:
        determined_by = 'video'
        save_chunk = True

    if use_app1:
        try:
            self.sample_exif_bytes = self.camera.get_exif_extract_from_jpeg(path, name)
        except CameraProblemEx as e:
            uri = get_uri(full_file_name=os.path.join(path, name),
                          camera_details=self.camera_details)
            self.problems.append(CameraFileReadProblem(uri=uri, name=name, gp_code=e.gp_code))
        else:
            try:
                with stdchannel_redirected(sys.stderr, os.devnull):
                    metadata = metadataphoto.MetaData(app1_segment=self.sample_exif_bytes)
            except Exception:
                # Deliberately broad, but not bare: a bare except would also
                # swallow KeyboardInterrupt and SystemExit
                logging.warning("Scanner failed to load metadata from %s on %s", name,
                                self.camera.display_name)
                self.sample_exif_bytes = None
                uri = get_uri(full_file_name=os.path.join(path, name),
                              camera_details=self.camera_details)
                self.problems.append(FileMetadataLoadProblem(uri=uri, name=name))
            else:
                self.sample_exif_source = ExifSource.app1_segment
                self.sample_photo_file_full_file_name = os.path.join(path, name)
                dt = metadata.date_time(missing=None)  # type: datetime
    elif exif_extract:
        offset = all_tags_offset.get(extension)
        if offset is None:
            offset = size
        # Never ask for more than the file contains
        offset = min(size, offset)
        self.sample_exif_bytes = self.camera.get_exif_extract(path, name, offset)
        if self.sample_exif_bytes is not None:
            try:
                with stdchannel_redirected(sys.stderr, os.devnull):
                    metadata = metadataphoto.MetaData(raw_bytes=self.sample_exif_bytes)
            except Exception:
                # See the note on the equivalent handler above
                logging.warning("Scanner failed to load metadata from %s on %s", name,
                                self.camera.display_name)
                self.sample_exif_bytes = None
                uri = get_uri(full_file_name=os.path.join(path, name),
                              camera_details=self.camera_details)
                self.problems.append(FileMetadataLoadProblem(uri=uri, name=name))
            else:
                self.sample_exif_source = ExifSource.raw_bytes
                self.sample_photo_file_full_file_name = os.path.join(path, name)
                dt = metadata.date_time(missing=None)  # type: datetime
    else:
        assert save_chunk
        # video
        offset = all_tags_offset.get(extension)
        if offset is None:
            offset = 1024**2 * 20  # 20 MiB, i.e. approx 21 MB
        # As with photos, never ask for more than the file contains
        offset = min(size, offset)

        # First try offset value, and if it fails, read the entire video
        # Reading the metadata on some videos will fail if the entire video
        # is not read, e.g. an iPhone 5 video
        # When the first chunk already is the entire file, a second attempt
        # would only download the identical data again, so try just once.
        chunk_sizes = (offset, ) if offset == size else (offset, size)

        temp_name = os.path.join(tempfile.gettempdir(),
                                 GenerateRandomFileName().name(extension=extension))
        with ExifTool() as et_process:
            for chunk_size in chunk_sizes:
                if chunk_size == size:
                    logging.debug("Downloading entire video for metadata sample (%s)",
                                  format_size_for_user(size))
                mtime = int(self.adjusted_mtime(float(modification_time)))
                try:
                    self.camera.save_file_chunk(path, name, chunk_size, temp_name, mtime)
                except CameraProblemEx as e:
                    if e.code == CameraErrorCode.read:
                        uri = get_uri(os.path.join(path, name),
                                      camera_details=self.camera_details)
                        self.problems.append(CameraFileReadProblem(uri=uri, name=name,
                                                                   gp_code=e.gp_code))
                    else:
                        assert e.code == CameraErrorCode.write
                        uri = get_uri(path=os.path.dirname(temp_name))
                        self.problems.append(FileWriteProblem(uri=uri, name=temp_name,
                                                              exception=e.py_exception))
                else:
                    metadata = metadatavideo.MetaData(temp_name, et_process)
                    dt = metadata.date_time(missing=None, ignore_file_modify_date=True)
                    width = metadata.width(missing=None)
                    height = metadata.height(missing=None)
                    # The chunk is accepted as the video sample only when the
                    # date time and both dimensions could all be read from it
                    if dt is not None and width is not None and height is not None:
                        self.sample_video_full_file_downloaded = chunk_size == size
                        self.sample_video_extract_full_file_name = temp_name
                        self.sample_video_file_full_file_name = os.path.join(path, name)
                        break

    if dt is None:
        logging.warning("Scanner failed to extract date time metadata from %s on %s",
                        name, self.camera.display_name)
    else:
        self.file_mdatatime[os.path.join(path, name)] = float(dt.timestamp())
        logging.info("Extracted date time value %s for %s on %s", dt, name,
                     self.camera_display_name)
    return SampleMetadata(dt, determined_by)


def sample_non_camera_metadata(self, path: str,
                               name: str,
                               full_file_name: str,
                               ext_type: FileExtension) -> SampleMetadata:
    """
    Extract sample metadata datetime from a photo or video not on a camera.

    Videos are read with ExifTool; photos (jpeg and RAW alike) are read with
    metadataphoto.MetaData. A photo whose metadata cannot be loaded is
    recorded in self.problems. When the date time is found it is stored in
    self.file_mdatatime, keyed by full_file_name.

    :param path: directory containing the file
    :param name: the file's name
    :param full_file_name: the file's name including its path
    :param ext_type: whether the file is a jpeg, RAW or video
    :return: the metadata date time (None if it could not be extracted),
     and a short string naming the kind of file it was determined by
    """

    dt = determined_by = None
    if ext_type == FileExtension.jpeg:
        determined_by = 'jpeg'
    elif ext_type == FileExtension.raw:
        determined_by = 'RAW'
    elif ext_type == FileExtension.video:
        determined_by = 'video'

    if ext_type == FileExtension.video:
        with ExifTool() as et_process:
            metadata = metadatavideo.MetaData(full_file_name, et_process)
            self.sample_video_file_full_file_name = os.path.join(path, name)
            dt = metadata.date_time(missing=None)
    else:
        # photo - we don't care if jpeg or RAW
        try:
            with stdchannel_redirected(sys.stderr, os.devnull):
                metadata = metadataphoto.MetaData(full_file_name=full_file_name)
        except Exception:
            logging.warning("Scanner failed to load metadata from %s on %s", name,
                            self.display_name)
            uri = get_uri(full_file_name=full_file_name)
            self.problems.append(FileMetadataLoadProblem(uri=uri, name=name))
        else:
            self.sample_exif_source = ExifSource.actual_file
            self.sample_photo_file_full_file_name = os.path.join(path, name)
            dt = metadata.date_time(missing=None)  # type: datetime

    if dt is None:
        logging.warning("Scanner failed to extract date time metadata from %s on %s",
                        name, self.display_name)
    else:
        self.file_mdatatime[full_file_name] = dt.timestamp()
    return SampleMetadata(dt, determined_by)
def examine_sample_non_camera_file(self, dirname: str,
                                   name: str,
                                   full_file_name: str,
                                   ext_type: FileExtension) -> bool:
    """
    Examine the sample file to extract its metadata and compare it
    against the file system modification time.

    :param dirname: directory containing the file
    :param name: the file's name
    :param full_file_name: the file's name including its path
    :param ext_type: whether the file is a jpeg, RAW or video
    :return: True if the sample's metadata date time and its file system
     modification time were both read and handed on to
     determine_device_timestamp_tz(), else False
    """

    logging.debug("Examining sample %s", full_file_name)
    sample = self.sample_non_camera_metadata(dirname, name, full_file_name, ext_type)
    if sample.datetime is None:
        # Nothing to compare against. Return an explicit False rather than
        # falling off the end of the function, so the declared bool return
        # type holds for every path.
        return False

    self.file_mdatatime[full_file_name] = sample.datetime.timestamp()
    try:
        mtime = os.path.getmtime(full_file_name)
    except OSError as e:
        logging.warning("Could not determine modification time for %s",
                        full_file_name)
        uri = get_uri(full_file_name=full_file_name)
        self.problems.append(FsMetadataReadProblem(uri=uri, name=name, exception=e))
        return False

    # Located sample file: examine
    self.determine_device_timestamp_tz(
        sample.datetime, mtime, sample.determined_by)
    return True
def distinguish_non_camera_device_timestamp(self, path: str) -> None:
    """
    Try to work out how this device treats time zones when it stores
    timestamps.

    Sample files are examined in order of preference: (1) RAW, (2) jpeg and
    finally (3) video, because a RAW is the least likely to have been
    modified. RAW files are examined as soon as they are encountered while
    walking the file system; jpegs and videos are only remembered, and are
    examined afterwards if no RAW file produced a result.

    NOTE: this creates a sample file for one type of file (RAW if present,
    if not, then jpeg, if jpeg also not present, then video). However if
    a raw / jpeg is found, then still need to create sample file for video.

    :param path: the path whose files are walked in search of samples
    """

    logging.debug("Distinguishing approach to timestamp time zones on %s", self.display_name)

    self.device_timestamp_type = DeviceTimestampTZ.unknown

    attempt_limit = 10
    raws_examined = 0
    # Candidates set aside for later, keyed by their extension type
    deferred = defaultdict(deque)
    sample_types = (FileExtension.raw, FileExtension.jpeg, FileExtension.video)

    for folder, file_name in self.walk_file_system(path):
        file_path = os.path.join(folder, file_name)
        extension = os.path.splitext(file_path)[1].lower()[1:]
        kind = rpdfile.extension_type(extension)
        if kind not in sample_types:
            continue

        if kind == FileExtension.raw and raws_examined < attempt_limit:
            # RAW files are examined right away
            raws_examined += 1
            if self.examine_sample_non_camera_file(
                    dirname=folder, name=file_name, full_file_name=file_path,
                    ext_type=kind):
                return
        elif len(deferred[kind]) < attempt_limit:
            deferred[kind].append((folder, file_name, file_path))

        # Enough jpegs have been gathered: stop walking the file system
        if len(deferred[FileExtension.jpeg]) == attempt_limit:
            break

    # No RAW file gave a result. Fall back on the jpegs, then the videos,
    # of which there are at most attempt_limit each.
    for kind in (FileExtension.jpeg, FileExtension.video):
        for folder, file_name, file_path in deferred[kind]:
            if self.examine_sample_non_camera_file(
                    dirname=folder, name=file_name, full_file_name=file_path,
                    ext_type=kind):
                return
def determine_device_timestamp_tz(self, mdatatime: datetime,
                                  modification_time: Union[int, float],
                                  determined_by: str) -> None:
    """
    Compare metadata time with file modification time in an attempt
    to determine the device's approach to timezones when it stores timestamps.

    The outcome is stored in self.device_timestamp_type.

    :param mdatatime: file's metadata time
    :param modification_time: file's file system modification time
    :param determined_by: simple string used in log messages
    """

    if mdatatime is None:
        logging.debug("Could not determine Device timezone setting for %s",
                      self.display_name)
        self.device_timestamp_type = DeviceTimestampTZ.unknown
        # Without a metadata time there is nothing to compare the
        # modification time against, so stop here instead of falling
        # through to the comparisons below with dt2=None
        return

    # Must not compare exact times, as there can be a few seconds difference between
    # when a file was saved to the flash memory and when it was created in the
    # camera's memory. Allow for two minutes, to be safe.
    if datetime_roughly_equal(dt1=datetime.utcfromtimestamp(modification_time),
                              dt2=mdatatime):
        logging.info("Device timezone setting for %s is UTC, as indicated by %s file",
                     self.display_name, determined_by)
        self.device_timestamp_type = DeviceTimestampTZ.is_utc
    elif datetime_roughly_equal(dt1=datetime.fromtimestamp(modification_time),
                                dt2=mdatatime):
        logging.info("Device timezone setting for %s is local time, as indicated by "
                     "%s file", self.display_name, determined_by)
        self.device_timestamp_type = DeviceTimestampTZ.is_local
    else:
        logging.info("Device timezone setting for %s is unknown, because the file "
                     "modification time and file's time as recorded in metadata differ for "
                     "sample file %s",
                     self.display_name, determined_by)
        self.device_timestamp_type = DeviceTimestampTZ.unknown


def adjusted_mtime(self, mtime: float) -> float:
    """
    Use the same calculated mtime that will be applied when the mtime
    is saved in the rpd_file

    :param mtime: raw modification time
    :return: modification time adjusted, if needed. An adjustment is made
     only when the device's timestamps have been determined to be UTC.
    """

    if self.device_timestamp_type == DeviceTimestampTZ.is_utc:
        # utcfromtimestamp() returns a naive datetime holding the UTC wall
        # clock time; timestamp() then interprets that naive value as local
        # time, which is what produces the adjustment
        return datetime.utcfromtimestamp(mtime).timestamp()
    else:
        return mtime
self._camera_directories_for_file[camera_file]: + return '{}.{}'.format(os.path.join(path, base_name),ext) + return None + + def get_video_THM_file(self, base_name: str, camera_file: CameraFile) -> Optional[str]: + """ + Checks to see if a thumbnail file (THM) with the same base name + is in the same directory as the file. + + :param base_name: the file name without the extension + :return: filename, including path, if found, else returns None + """ + + if self.download_from_camera: + return self._get_associate_file_from_camera(base_name, + self._camera_video_thumbnails, camera_file) + else: + return self._get_associate_file(base_name, rpdfile.VIDEO_THUMBNAIL_EXTENSIONS) + + def get_audio_file(self, base_name: str, camera_file: CameraFile) -> Optional[str]: + """ + Checks to see if an audio file with the same base name + is in the same directory as the file. + + :param base_name: the file name without the extension + :return: filename, including path, if found, else returns None + """ + + if self.download_from_camera: + return self._get_associate_file_from_camera( + base_name, self._camera_audio_files, camera_file + ) + else: + return self._get_associate_file(base_name, rpdfile.AUDIO_EXTENSIONS) + + def get_xmp_file(self, base_name: str, camera_file: CameraFile) -> Optional[str]: + """ + Checks to see if an XMP file with the same base name + is in the same directory as tthe file. 
+ + :param base_name: the file name without the extension + :return: filename, including path, if found, else returns None + """ + if self.download_from_camera: + return self._get_associate_file_from_camera( + base_name, self._camera_xmp_files, camera_file + ) + else: + return self._get_associate_file(base_name, ['XMP']) + + def _get_associate_file(self, base_name: str, extensions_to_check: List[str]) -> Optional[str]: + full_file_name_no_ext = os.path.join(self.dir_name, base_name) + for e in extensions_to_check: + possible_file = '{}.{}'.format(full_file_name_no_ext, e) + if os.path.exists(possible_file): + return possible_file + possible_file = '{}.{}'.format(full_file_name_no_ext, e.upper()) + if os.path.exists(possible_file): + return possible_file + return None + + def cleanup_pre_stop(self): + if self.camera is not None: + self.camera.free_camera() + self.send_problems() + + @property + def camera_details(self) -> Optional[CameraDetails]: + return self._camera_details + + @camera_details.setter + def camera_details(self, index: Optional[int]) -> None: + """ + :param index: index into the storage details, for cameras with more than one + storage + """ + + if not self.camera_storage_descriptions: + self.camera_storage_descriptions = self.camera.get_storage_descriptions() + + if not self.camera_storage_descriptions: + # Problem: there are no descriptions for the storage + self._camera_details = CameraDetails( + model=self.camera_model, port=self.camera_port, + display_name=self.camera_display_name, + is_mtp=self.is_mtp_device, storage_desc=[] + ) + return + + index = index or 0 + + self._camera_details = CameraDetails( + model=self.camera_model, port=self.camera_port, display_name=self.camera_display_name, + is_mtp=self.is_mtp_device, storage_desc=self.camera_storage_descriptions[index] + ) + + +def trace_lines(frame, event, arg): + if event != 'line': + return + co = frame.f_code + func_name = co.co_name + line_no = frame.f_lineno + print('%s >>>>>>>>>>>>> At 
def trace_lines(frame, event, arg):
    """
    Local trace function that prints each line as it is about to execute.

    Returned by trace_calls() for the functions it traces into.

    :param frame: the current stack frame
    :param event: the kind of trace event; only 'line' is handled
    :param arg: unused
    """

    if event != 'line':
        return
    co = frame.f_code
    func_name = co.co_name
    line_no = frame.f_lineno
    print('%s >>>>>>>>>>>>> At %s line %s' % (datetime.now().ctime(), func_name, line_no))


def trace_calls(frame, event, arg):
    """
    Trace function, for use with sys.settrace(), that prints every function
    call and the location it was called from.

    :param frame: the stack frame of the function being called
    :param event: the kind of trace event; only 'call' is handled
    :param arg: unused
    :return: trace_lines for the functions that should additionally be
     traced line by line, else None
    """

    if event != 'call':
        return
    co = frame.f_code
    func_name = co.co_name
    # Skip calls that would otherwise swamp the output
    if func_name in ('write', '__getattribute__'):
        return
    func_line_no = frame.f_lineno
    func_filename = co.co_filename
    caller = frame.f_back
    if caller is not None:
        caller_line_no = caller.f_lineno
        caller_filename = caller.f_code.co_filename
    else:
        caller_line_no = caller_filename = ''
    print('% s Call to %s on line %s of %s from line %s of %s' %
          (datetime.now().ctime(), func_name, func_line_no, func_filename, caller_line_no,
           caller_filename))

    # Functions to trace line by line. The first name was previously
    # misspelt 'distingish...', which matched no function at all.
    traced = ('distinguish_non_camera_device_timestamp', 'determine_device_timestamp_tz')
    if any(f in func_name for f in traced):
        # Trace into this function
        return trace_lines


if __name__ == "__main__":
    # Set the environment variable RPD_SCAN_DEBUG to any value to trace calls
    if os.getenv('RPD_SCAN_DEBUG') is not None:
        sys.settrace(trace_calls)
    scan = ScanWorker()