summaryrefslogtreecommitdiff
path: root/raphodo/scan.py
diff options
context:
space:
mode:
Diffstat (limited to 'raphodo/scan.py')
-rwxr-xr-xraphodo/scan.py1241
1 files changed, 1241 insertions, 0 deletions
diff --git a/raphodo/scan.py b/raphodo/scan.py
new file mode 100755
index 0000000..1a6de86
--- /dev/null
+++ b/raphodo/scan.py
@@ -0,0 +1,1241 @@
+#!/usr/bin/env python3
+
+# Copyright (C) 2011-2017 Damon Lynch <damonlynch@gmail.com>
+
+# This file is part of Rapid Photo Downloader.
+#
+# Rapid Photo Downloader is free software: you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Rapid Photo Downloader is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Rapid Photo Downloader. If not,
+# see <http://www.gnu.org/licenses/>.
+
+"""
+Scans directory looking for photos and videos, and any associated files
+external to the actual photo/video including thumbnail files, XMP files, and
+audio files that are linked to a photo.
+
+Returns results using the 0mq pipeline pattern.
+
+Photo and movie metadata is (for the most part) not read during this
+scan process, because doing so is too slow. However, as part of scanning a
+device, there are two aspects to metadata that are in fact needed:
+
+1. A sample of photo and video metadata, that is used to demonstrate file
+ renaming. That is one sample photo, and one sample video.
+
+2. The device's time zone must be determined, as cameras handle their time
+ zone setting differently from phones, and results can be unpredictable.
+ Therefore we need to analyze the created date time metadata of a file on
+ the device and compare it against the file modification time on the file
+ system or, more importantly, gphoto2. It's not an exact science and there
+ are problems, but doing this is better than not doing it at all.
+
+"""
+
+__author__ = 'Damon Lynch'
+__copyright__ = "Copyright 2011-2017, Damon Lynch"
+
+import os
+import sys
+import pickle
+import logging
+from collections import (namedtuple, defaultdict, deque)
+from datetime import datetime
+import tempfile
+import operator
+
+if sys.version_info < (3,5):
+ import scandir
+ walk = scandir.walk
+else:
+ walk = os.walk
+from typing import List, Dict, Union, Optional, Iterator, Tuple, DefaultDict
+
+import gphoto2 as gp
+
+# Instances of classes ScanArguments and ScanPreferences are passed via pickle
+# Thus do not remove these two imports
+from raphodo.interprocess import ScanArguments
+from raphodo.preferences import ScanPreferences, Preferences
+from raphodo.interprocess import (WorkerInPublishPullPipeline, ScanResults,
+ ScanArguments)
+from raphodo.camera import Camera, CameraError, CameraProblemEx
+import raphodo.rpdfile as rpdfile
+from raphodo.constants import (DeviceType, FileType, DeviceTimestampTZ, CameraErrorCode,
+ FileExtension, ThumbnailCacheDiskStatus, all_tags_offset, ExifSource)
+from raphodo.rpdsql import DownloadedSQL, FileDownloaded
+from raphodo.cache import ThumbnailCacheSql
+from raphodo.utilities import (stdchannel_redirected, datetime_roughly_equal,
+ GenerateRandomFileName, format_size_for_user)
+from raphodo.exiftool import ExifTool
+import raphodo.metadatavideo as metadatavideo
+import raphodo.metadataphoto as metadataphoto
+from raphodo.problemnotification import (
+ ScanProblems, UnhandledFileProblem, CameraDirectoryReadProblem, CameraFileInfoProblem,
+ CameraFileReadProblem, FileMetadataLoadProblem, FileWriteProblem, FsMetadataReadProblem,
+ FileZeroLengthProblem
+)
+from raphodo.storage import get_uri, CameraDetails
+
+# Details recorded for each unique photo or video located on a camera
+FileInfo = namedtuple(
+    'FileInfo',
+    ['path', 'modification_time', 'size', 'ext_lower', 'base_name', 'file_type']
+)
+# Name and size of a file on a camera; used as a dictionary key while scanning
+CameraFile = namedtuple('CameraFile', ['name', 'size'])
+# Candidate photo or video on a camera from which sample metadata may be extracted
+CameraMetadataDetails = namedtuple(
+    'CameraMetadataDetails',
+    ['path', 'name', 'size', 'extension', 'mtime', 'file_type']
+)
+# Date time extracted from a sample file, and the kind of file it came from
+SampleMetadata = namedtuple('SampleMetadata', ['datetime', 'determined_by'])
+
+
+class ScanWorker(WorkerInPublishPullPipeline):
+
+ def __init__(self):
+ """
+ Set up the state used while scanning, then call the base class
+ initializer with the worker name 'Scan'.
+ """
+
+ # Record of previously downloaded files, and the thumbnail cache
+ self.downloaded = DownloadedSQL()
+ self.thumbnail_cache = ThumbnailCacheSql()
+ # Count of scanned files found to have been downloaded previously
+ self.no_previously_downloaded = 0
+ # Scanned files are sent to the sink in batches of batch_size
+ self.file_batch = []
+ self.batch_size = 50
+ self.file_type_counter = rpdfile.FileTypeCounter()
+ self.file_size_sum = rpdfile.FileSizeSum()
+ self.device_timestamp_type = DeviceTimestampTZ.undetermined
+
+ # full_file_name (path+name):timestamp
+ self.file_mdatatime = {} # type: Dict[str, float]
+
+ # Sample photo and video details; all are unset until a sample is found
+ self.sample_exif_bytes = None # type: Optional[bytes]
+ self.sample_exif_source = None # type: Optional[ExifSource]
+ self.sample_photo = None # type: Optional[rpdfile.Photo]
+ self.sample_video = None # type: Optional[rpdfile.Video]
+ self.sample_video_extract_full_file_name = None # type: Optional[str]
+ self.sample_photo_file_full_file_name = None # type: Optional[str]
+ self.sample_video_file_full_file_name = None # type: Optional[str]
+ self.sample_video_full_file_downloaded = None # type: Optional[bool]
+ self.found_sample_photo = False
+ self.found_sample_video = False
+
+ self.prefs = Preferences()
+ self.scan_preferences = ScanPreferences(self.prefs.ignored_paths)
+
+ self.problems = ScanProblems()
+
+ self._camera_details = None # type: Optional[CameraDetails]
+
+ # NOTE(review): called last, presumably because the base class initializer
+ # starts processing work and needs the attributes above to exist -- confirm
+ super().__init__('Scan')
+
+    def do_work(self) -> None:
+        """
+        Run the scan, reporting any unexpected failure to the sink.
+
+        If do_scan() raises, the exception is logged, a ScanResults with
+        fatal_error=True is sent to the sink, logging is disconnected and
+        the finished command is sent.
+        """
+
+        try:
+            self.do_scan()
+        except Exception:
+            # display_name is assigned part-way through do_scan(), so it may
+            # not exist if the failure happened early
+            device_name = getattr(self, 'display_name', '')
+            logging.exception("Unexpected exception while scanning %s", device_name)
+
+            failure = ScanResults(scan_id=int(self.worker_id), fatal_error=True)
+            self.content = pickle.dumps(failure, pickle.HIGHEST_PROTOCOL)
+            self.send_message_to_sink()
+            self.disconnect_logging()
+            self.send_finished_command()
+
+ def do_scan(self) -> None:
+ """
+ Scan the device described by the pickled ScanArguments in self.content.
+
+ Files on a file system path are found by walking the path. Files on a
+ camera are first catalogued from its DCIM folders, after which sample
+ metadata is extracted. In both cases every file found is then handed to
+ process_file().
+
+ Once scanning is complete, any files remaining in the batch are sent
+ to the sink, followed by any problems, and then the finished command.
+ """
+
+ logging.debug("Scan {} worker started".format(self.worker_id.decode()))
+
+ scan_arguments = pickle.loads(self.content) # type: ScanArguments
+ if scan_arguments.log_gphoto2:
+ gp.use_python_logging()
+
+ if scan_arguments.ignore_other_types:
+ # Rebinds the name PHOTO_EXTENSIONS_SCAN in the rpdfile module
+ rpdfile.PHOTO_EXTENSIONS_SCAN = rpdfile.PHOTO_EXTENSIONS_WITHOUT_OTHER
+
+ self.device = scan_arguments.device
+
+ self.download_from_camera = scan_arguments.device.device_type == DeviceType.camera
+ self.camera_storage_descriptions = []
+ if self.download_from_camera:
+ self.camera_model = scan_arguments.device.camera_model
+ self.camera_port = scan_arguments.device.camera_port
+ self.is_mtp_device = scan_arguments.device.is_mtp_device
+ self.camera_display_name = scan_arguments.device.display_name
+ self.display_name = self.camera_display_name
+ self.ignore_mdatatime_for_mtp_dng = self.is_mtp_device and \
+ self.prefs.ignore_mdatatime_for_mtp_dng
+ else:
+ # Not a camera, so the camera specific attributes do not apply
+ self.camera_port = self.camera_model = self.is_mtp_device = None
+ self.ignore_mdatatime_for_mtp_dng = False
+ self.camera_display_name = None
+
+ self.files_scanned = 0
+ self.camera = None
+
+ if not self.download_from_camera:
+ # Download from file system
+ path = os.path.abspath(scan_arguments.device.path)
+ if not self.prefs.device_without_dcim_autodetection and \
+ scan_arguments.device.device_type == DeviceType.volume:
+ path = os.path.join(path, "DCIM")
+ self.display_name = scan_arguments.device.display_name
+ # Scan the files using lightweight high-performance scandir
+ logging.info("Scanning {}".format(self.display_name))
+
+ self.problems.uri = get_uri(path=path)
+ self.problems.name = self.display_name
+
+ # Before doing anything else, determine time zone approach
+ # Need two different walks because first folder of files
+ # might be videos, then the 2nd folder photos, etc.
+ self.distinguish_non_camera_device_timestamp(path)
+
+ if self.scan_preferences.scan_this_path(path):
+ for dir_name, name in self.walk_file_system(path):
+ self.dir_name = dir_name
+ self.file_name = name
+ self.process_file()
+
+ else:
+ # scanning directly from camera
+ have_optimal_display_name = scan_arguments.device.have_optimal_display_name
+ # Loop until the camera is successfully initialized
+ while True:
+ try:
+ self.camera = Camera(model=scan_arguments.device.camera_model,
+ port=scan_arguments.device.camera_port,
+ raise_errors=True)
+ if not have_optimal_display_name:
+ # Update the GUI with the real name of the camera
+ # and its storage information
+ have_optimal_display_name = True
+ self.camera_display_name = self.camera.display_name
+ self.display_name = self.camera_display_name
+ storage_space = self.camera.get_storage_media_capacity(refresh=True)
+ storage_descriptions = self.camera.get_storage_descriptions()
+ self.content = pickle.dumps(
+ ScanResults(
+ optimal_display_name=self.camera_display_name,
+ storage_space=storage_space,
+ storage_descriptions=storage_descriptions,
+ scan_id=int(self.worker_id),
+ ),
+ pickle.HIGHEST_PROTOCOL
+ )
+ self.send_message_to_sink()
+ break
+ except CameraProblemEx as e:
+ # Report the camera error code to the sink
+ self.content = pickle.dumps(ScanResults(
+ error_code=e.code,
+ scan_id=int(self.worker_id)),
+ pickle.HIGHEST_PROTOCOL)
+ self.send_message_to_sink()
+ # Wait for command to resume or halt processing
+ self.resume_work()
+
+ if self.download_from_camera:
+ # Start with the first storage space; others are handled in the loop below
+ self.camera_details = 0
+ self.problems.uri = get_uri(camera_details=self.camera_details)
+ self.problems.name = self.display_name
+
+ if self.ignore_mdatatime_for_mtp_dng:
+ logging.info("For any DNG files on the %s, when determining the creation date/"
+ "time, the metadata date/time will be ignored, and the file "
+ "modification date/time used instead", self.display_name)
+
+ # Download only from the DCIM folder(s) in the camera.
+ # Phones especially have many directories with images, which we
+ # must ignore
+ if self.camera.camera_has_dcim():
+ logging.info("Scanning {}".format(self.display_name))
+ # Containers filled in by locate_files_on_camera()
+ self._camera_folders_and_files = []
+ self._camera_file_names = defaultdict(list)
+ self._camera_audio_files = defaultdict(list)
+ self._camera_video_thumbnails = defaultdict(list)
+ self._camera_xmp_files = defaultdict(list)
+ self._folder_identifiers = {}
+ self._folder_identifers_for_file = \
+ defaultdict(list) # type: DefaultDict[int, List[int]]
+ self._camera_directories_for_file = defaultdict(list)
+ self._camera_photos_videos_by_type = \
+ defaultdict(list) # type: DefaultDict[FileExtension, List[CameraMetadataDetails]]
+
+ dcim_folders = self.camera.dcim_folders
+
+ if len(dcim_folders) > 1:
+ # This camera has dual memory cards.
+ # Give each folder a numeric identifier that will be
+ # used to identify which card a given file comes from
+ dcim_folders.sort()
+ for idx, folder in enumerate(dcim_folders):
+ self._folder_identifiers[folder] = idx + 1
+
+ # locate photos and videos, identifying duplicate files
+ # identify candidates for extracting metadata
+ for idx, dcim_folder in enumerate(dcim_folders):
+ # Setup camera details for each storage space in the camera
+ self.camera_details = idx
+ # Now initialize the problems container, if not already done so
+ # (it was already set up for idx 0 before the loop)
+ if idx:
+ self.problems.name = self.camera_display_name
+ self.problems.uri = get_uri(camera_details=self.camera_details)
+
+ logging.debug("Scanning %s on %s", dcim_folder, self.camera.display_name)
+ folder_identifier = self._folder_identifiers.get(dcim_folder)
+ # Drop as many trailing characters as are in '/DCIM'
+ # (assumes dcim_folder ends with '/DCIM' -- TODO confirm)
+ basedir = dcim_folder[:-len('/DCIM')]
+ self.locate_files_on_camera(dcim_folder, folder_identifier, basedir)
+
+ # extract non camera metadata
+ if self._camera_photos_videos_by_type:
+ self.identify_camera_tz_and_sample_files()
+
+ # now, process each file
+ for self.dir_name, self.file_name in self._camera_folders_and_files:
+ self.process_file()
+ else:
+ logging.warning("Unable to detect any DCIM folders on %s", self.display_name)
+
+ self.camera.free_camera()
+
+ if self.file_batch:
+ # Send any remaining files, including the sample photo or video
+ self.content = pickle.dumps(
+ ScanResults(
+ self.file_batch,
+ self.file_type_counter,
+ self.file_size_sum,
+ sample_photo=self.sample_photo,
+ sample_video=self.sample_video
+ ),
+ pickle.HIGHEST_PROTOCOL
+ )
+ self.send_message_to_sink()
+
+ self.send_problems()
+
+ # NOTE(review): the second clause is redundant. Whenever files_scanned > 0,
+ # `files_scanned == 0` is False, so this reduces to `self.files_scanned > 0`
+ if self.files_scanned > 0 and not (self.files_scanned == 0 and self.download_from_camera):
+ logging.info("{} total files scanned on {}".format(self.files_scanned,
+ self.display_name))
+
+ self.disconnect_logging()
+ self.send_finished_command()
+
+    def send_problems(self) -> None:
+        """
+        Send the accumulated scan problems to the sink, if there are any.
+        """
+
+        if not self.problems:
+            return
+
+        report = ScanResults(scan_id=int(self.worker_id), problems=self.problems)
+        self.content = pickle.dumps(report, pickle.HIGHEST_PROTOCOL)
+        self.send_message_to_sink()
+
+    def walk_file_system(self, path_to_walk: str) -> Iterator[Tuple[str, str]]:
+        """
+        Generate the files found on the local file system, skipping
+        directories the user doesn't want scanned.
+
+        :param path_to_walk: the path to scan
+        :return: iterator of tuples of (directory name, file name)
+        """
+
+        for current_dir, sub_dirs, files in walk(path_to_walk):
+            if sub_dirs and self.scan_preferences.ignored_paths:
+                # Prune ignored directories using slice assignment, so that
+                # the very list the walk is using is altered and the walk
+                # does not descend into them
+                sub_dirs[:] = [
+                    sub_dir for sub_dir in sub_dirs
+                    if self.scan_preferences.scan_this_path(sub_dir)
+                ]
+            for file_name in files:
+                yield current_dir, file_name
+
+ def locate_files_on_camera(self, path: str, folder_identifier: int, basedir: str) -> None:
+ """
+ Scans the memory card(s) on the camera for photos, videos,
+ audio files, XMP files, and video thumbnail (THM) files. Looks
+ only in the camera's DCIM folders, which are assumed to have
+ already been located. Subfolders are scanned recursively.
+
+ We cannot assume file names are unique on any one memory card,
+ as although it's unlikely, it's possible that a file with
+ the same name might be in different subfolders.
+
+ For cameras with two memory cards, there are two broad
+ possibilities:
+
+ (1) the cards' contents mirror each other, because the camera
+ writes the same files to both cards simultaneously
+
+ (2) each card has a different set of files, e.g. because a
+ different file type is written to each card, or the 2nd card is
+ used only when the first is full
+
+ In practice, we have to assume that if there are two memory
+ cards, some files will be identical, and others different. Thus
+ we have to scan the contents of both cards, analyzing file
+ names, file modification times and file sizes.
+
+ If a camera has more than one memory card, we store which
+ card the file came from using a simple numeric identifier i.e.
+ 1 or 2.
+
+ For duplicate files, we record both directories the file is
+ stored on.
+
+ :param path: the path on the camera to analyze for files and
+ folders
+ :param folder_identifier: if not None, then indicates (1) the
+ camera being scanned has more than one memory card, and (2)
+ the simple numeric identifier of the memory card being
+ scanned right now
+ :param basedir: the base directory of the path, as reported by
+ libgphoto2
+ """
+
+ files_in_folder = []
+ names = []
+ try:
+ files_in_folder = self.camera.camera.folder_list_files(path, self.camera.context)
+ except gp.GPhoto2Error as e:
+ logging.error("Unable to scan files on camera: error %s", e.code)
+ uri = get_uri(path=path, camera_details=self.camera_details)
+ self.problems.append(CameraDirectoryReadProblem(uri=uri, name=path, gp_code=e.code))
+
+ if files_in_folder:
+ # Distinguish the file type for every file in the folder
+ names = [name for name, value in files_in_folder]
+ split_names = [os.path.splitext(name) for name in names]
+ # Remove the period from the extension
+ exts = [ext[1:] for name, ext in split_names]
+ exts_lower = [ext.lower() for ext in exts]
+ ext_types = [rpdfile.extension_type(ext) for ext in exts_lower]
+
+ for idx, name in enumerate(names):
+ # Check to see if the process has received a command to terminate
+ # or pause
+ self.check_for_controller_directive()
+
+ # Get the information we extracted above
+ base_name = split_names[idx][0]
+ ext = exts[idx]
+ ext_lower = exts_lower[idx]
+ ext_type = ext_types[idx]
+ file_type = rpdfile.file_type(ext_lower)
+
+ if file_type is not None:
+ # file is a photo or video
+ file_is_unique = True
+ try:
+ modification_time, size = self.camera.get_file_info(path, name)
+ except gp.GPhoto2Error as e:
+ logging.error(
+ "Unable to access modification_time or size from %s on %s. Error code: %s",
+ os.path.join(path, name), self.display_name, e.code
+ )
+ # No file information is available, so use zero for both values
+ modification_time, size = 0, 0
+ uri = get_uri(
+ full_file_name=os.path.join(path, name), camera_details=self.camera_details
+ )
+ self.problems.append(CameraFileInfoProblem(uri=uri, gp_code=e.code))
+ else:
+ if size <= 0:
+ full_file_name = os.path.join(path, name)
+ logging.error(
+ "Zero length file %s will not be downloaded from %s",
+ full_file_name, self.display_name
+ )
+ uri = get_uri(
+ full_file_name=full_file_name, camera_details=self.camera_details
+ )
+ self.problems.append(FileZeroLengthProblem(name=name, uri=uri))
+
+ if size > 0:
+ key = rpdfile.make_key(file_type, basedir)
+ self.file_type_counter[key] += 1
+ self.file_size_sum[key] += size
+
+ # Store the directory this file is stored in, used when
+ # determining if associated files are part of the download
+ cf = CameraFile(name=name, size=size)
+ self._camera_directories_for_file[cf].append(path)
+
+ if folder_identifier is not None:
+ # Store which card the file came from using a
+ # simple numeric identifier i.e. 1 or 2.
+ self._folder_identifers_for_file[cf].append(folder_identifier)
+
+ if name in self._camera_file_names:
+ for existing_file_info in self._camera_file_names[name]:
+ # Don't compare file modification time in this
+ # comparison, because files can be written to
+ # different cards several seconds apart when
+ # the write speeds of the cards differ
+ if existing_file_info.size == size:
+ file_is_unique = False
+ break
+ if file_is_unique:
+ # Record the file for processing, and as a candidate for
+ # sample metadata extraction
+ file_info = FileInfo(
+ path=path, modification_time=modification_time,
+ size=size, file_type=file_type, base_name=base_name,
+ ext_lower=ext_lower
+ )
+ metadata_details = CameraMetadataDetails(
+ path=path, name=name, size=size, extension=ext_lower,
+ mtime=modification_time, file_type=file_type
+ )
+ self._camera_file_names[name].append(file_info)
+ self._camera_folders_and_files.append([path, name])
+ self._camera_photos_videos_by_type[ext_type].append(metadata_details)
+ else:
+ # this file on the camera is not a photo or video
+ # Associated files are stored by base name, with the path and
+ # extension (original case) of each
+ if ext_lower in rpdfile.AUDIO_EXTENSIONS:
+ self._camera_audio_files[base_name].append((path, ext))
+ elif ext_lower in rpdfile.VIDEO_THUMBNAIL_EXTENSIONS:
+ self._camera_video_thumbnails[base_name].append((path, ext))
+ elif ext_lower == 'xmp':
+ self._camera_xmp_files[base_name].append((path, ext))
+ else:
+ logging.info("Ignoring unknown file %s on %s",
+ os.path.join(path, name), self.display_name)
+ if self.prefs.warn_about_unknown_file(ext=ext):
+ uri = get_uri(
+ full_file_name=os.path.join(path, name),
+ camera_details=self.camera_details
+ )
+ self.problems.append(UnhandledFileProblem(name=name, uri=uri))
+ # Collect the subfolders the scan preferences allow to be scanned
+ folders = []
+ try:
+ for name, value in self.camera.camera.folder_list_folders(path, self.camera.context):
+ if self.scan_preferences.scan_this_path(os.path.join(path, name)):
+ folders.append(name)
+ except gp.GPhoto2Error as e:
+ logging.error("Unable to scan files on %s. Error code: %s", self.display_name, e.code)
+ uri = get_uri(path=path, camera_details=self.camera_details)
+ self.problems.append(CameraDirectoryReadProblem(uri=uri, name=path, gp_code=e.code))
+
+ # recurse over subfolders
+ for name in folders:
+ self.locate_files_on_camera(os.path.join(path, name), folder_identifier, basedir)
+
+ def identify_camera_tz_and_sample_files(self) -> None:
+ """
+ Get sample metadata for photos and videos, and determine device timezone setting.
+
+ Works on the photos and videos already catalogued in
+ self._camera_photos_videos_by_type, trying the smallest files of each
+ type first.
+ """
+
+ # do in place sort of jpegs, RAWs and videos by file size
+ for files in self._camera_photos_videos_by_type.values():
+ files.sort(key=operator.attrgetter('size'))
+
+ # When determining how a camera reports modification time, extraction order
+ # of preference is (1) jpeg, (2) RAW, and finally least preferred is (3) video
+ # However, if ignore_mdatatime_for_mtp_dng is set, the RAW files are tried
+ # last, and are never used to determine the time zone
+
+ if not self.ignore_mdatatime_for_mtp_dng:
+ order = (FileExtension.jpeg, FileExtension.raw, FileExtension.video)
+ else:
+ order = (FileExtension.jpeg, FileExtension.video, FileExtension.raw)
+
+ have_photos = len(self._camera_photos_videos_by_type[FileExtension.raw]) > 0 or \
+ len(self._camera_photos_videos_by_type[FileExtension.jpeg]) > 0
+ have_videos = len(self._camera_photos_videos_by_type[FileExtension.video]) > 0
+
+ # At most this many files of each type are considered. Because of the sort
+ # above, they are the smallest files of that type
+ max_attempts = 5
+ for ext_type in order:
+ for file in self._camera_photos_videos_by_type[ext_type][:max_attempts]: \
+ # type: CameraMetadataDetails
+ # The time zone is needed only while it is still undetermined
+ get_tz = self.device_timestamp_type == DeviceTimestampTZ.undetermined and not (
+ self.ignore_mdatatime_for_mtp_dng and ext_type == FileExtension.raw)
+ # A sample is needed only if one of this file's type has not yet been found
+ get_sample_metadata = (
+ file.file_type == FileType.photo and self.sample_exif_source is None) or (
+ file.file_type == FileType.video and
+ self.sample_video_extract_full_file_name is None)
+
+ if get_tz or get_sample_metadata:
+ logging.info("Extracting sample %s metadata for %s",
+ file.file_type.name, self.camera_display_name)
+ sample = self.sample_camera_metadata(
+ path=file.path, name=file.name, ext_type=ext_type, extension=file.extension,
+ modification_time=file.mtime, size=file.size)
+ if get_tz:
+ self.determine_device_timestamp_tz(sample.datetime, file.mtime,
+ sample.determined_by)
+ # Stop examining files once no sample is still outstanding
+ need_sample_photo = self.sample_exif_source is None and have_photos
+ need_sample_video = self.sample_video_extract_full_file_name is None and \
+ have_videos
+ if not (need_sample_photo or need_sample_video):
+ break
+
+ def process_file(self) -> None:
+ """
+ Process the file identified by self.dir_name and self.file_name.
+
+ If the file is a photo or video, an rpdfile is generated for it and
+ added to the batch of files to be sent to the sink. The batch is sent
+ when it contains batch_size files.
+ """
+
+ # Check to see if the process has received a command to terminate or
+ # pause
+ self.check_for_controller_directive()
+
+ file = os.path.join(self.dir_name, self.file_name)
+
+ # do we have permission to read the file?
+ if self.download_from_camera or os.access(file, os.R_OK):
+
+ # count how many files of each type are included
+ # i.e. how many photos and videos
+ self.files_scanned += 1
+ # Log progress every 10,000 files
+ if not self.files_scanned % 10000:
+ logging.info("Scanned {} files".format(
+ self.files_scanned))
+
+ if not self.download_from_camera:
+ base_name, ext = os.path.splitext(self.file_name)
+ # Lower case extension, without the leading period
+ ext = ext.lower()[1:]
+ file_type = rpdfile.file_type(ext)
+
+ # For next code block, see comment in
+ # self.distinguish_non_camera_device_timestamp()
+ # This only applies to files being scanned on the file system, not
+ # cameras / phones.
+ if file_type == FileType.photo and self.sample_exif_source is None:
+ # this should never happen due to photos being prioritized over videos
+ # with respect to time zone determination
+ logging.error("Sample metadata not extracted from photo %s although it should "
+ "have been used to determine the device timezone", self.file_name)
+ elif file_type == FileType.video and self.sample_video_file_full_file_name is None:
+ self.sample_non_camera_metadata(self.dir_name, self.file_name, file,
+ FileExtension.video)
+ else:
+ # Look up the details recorded by locate_files_on_camera() for the
+ # file with this name in this directory
+ base_name = None
+ for file_info in self._camera_file_names[self.file_name]:
+ if file_info.path == self.dir_name:
+ base_name = file_info.base_name
+ ext = file_info.ext_lower
+ file_type = file_info.file_type
+ break
+ assert base_name is not None
+
+ if file_type is not None:
+ self.file_type_counter[file_type] += 1
+
+ if self.download_from_camera:
+ # file_info is the match found in the lookup loop above
+ modification_time = file_info.modification_time
+ # zero length files have already been filtered out
+ size = file_info.size
+ camera_file = CameraFile(name=self.file_name, size=size)
+ else:
+ stat = os.stat(file)
+ size = stat.st_size
+ if size <= 0:
+ logging.error(
+ "Zero length file %s will not be downloaded from %s",
+ file, self.display_name
+ )
+ uri = get_uri(full_file_name=file)
+ self.problems.append(FileZeroLengthProblem(name=self.file_name, uri=uri))
+ return
+ modification_time = stat.st_mtime
+ camera_file = None
+
+ self.file_size_sum[file_type] += size
+
+ # look for thumbnail file (extension THM) for videos
+ if file_type == FileType.video:
+ thm_full_name = self.get_video_THM_file(base_name, camera_file)
+ else:
+ thm_full_name = None
+
+ # check if an XMP file is associated with the photo or video
+ xmp_file_full_name = self.get_xmp_file(base_name, camera_file)
+
+ # check if an audio file is associated with the photo or video
+ audio_file_full_name = self.get_audio_file(base_name, camera_file)
+
+ # has the file been downloaded previously?
+ # note: we should use the adjusted mtime, not the raw one
+ adjusted_mtime = self.adjusted_mtime(modification_time)
+
+ downloaded = self.downloaded.file_downloaded(
+ name=self.file_name,
+ size=size,
+ modification_time=adjusted_mtime)
+
+ thumbnail_cache_status = ThumbnailCacheDiskStatus.unknown
+
+ # Assign metadata time, if we have it
+ # If we don't, it will be extracted when thumbnails are generated
+ mdatatime = self.file_mdatatime.get(file, 0.0)
+
+ ignore_mdatatime = self.ignore_mdatatime(ext=ext)
+
+ if not mdatatime and self.prefs.use_thumbnail_cache and not ignore_mdatatime:
+ # Was there a thumbnail generated for the file?
+ # If so, get the metadata date time from that
+ get_thumbnail = self.thumbnail_cache.get_thumbnail_path(
+ full_file_name=file, mtime=adjusted_mtime,
+ size=size, camera_model=self.camera_model
+ )
+ thumbnail_cache_status = get_thumbnail.disk_status
+ if thumbnail_cache_status in (
+ ThumbnailCacheDiskStatus.found, ThumbnailCacheDiskStatus.failure):
+ mdatatime = get_thumbnail.mdatatime
+
+ if downloaded is not None:
+ self.no_previously_downloaded += 1
+ prev_full_name = downloaded.download_name
+ prev_datetime = downloaded.download_datetime
+ else:
+ prev_full_name = prev_datetime = None
+
+ if self.download_from_camera:
+ camera_memory_card_identifiers = self._folder_identifers_for_file[camera_file]
+ # Use None rather than an empty list when there are no identifiers
+ if not camera_memory_card_identifiers:
+ camera_memory_card_identifiers = None
+ else:
+ camera_memory_card_identifiers = None
+
+ problem=None
+
+ rpd_file = rpdfile.get_rpdfile(
+ name=self.file_name,
+ path=self.dir_name,
+ size=size,
+ prev_full_name=prev_full_name,
+ prev_datetime=prev_datetime,
+ device_timestamp_type=self.device_timestamp_type,
+ mtime=modification_time,
+ mdatatime=mdatatime,
+ thumbnail_cache_status=thumbnail_cache_status,
+ thm_full_name=thm_full_name,
+ audio_file_full_name=audio_file_full_name,
+ xmp_file_full_name=xmp_file_full_name,
+ scan_id=self.worker_id,
+ file_type=file_type,
+ from_camera=self.download_from_camera,
+ camera_details=self.camera_details,
+ camera_memory_card_identifiers=camera_memory_card_identifiers,
+ never_read_mdatatime=ignore_mdatatime,
+ device_display_name=self.display_name,
+ device_uri=self.device.uri,
+ raw_exif_bytes=None,
+ exif_source=None,
+ problem=problem
+ )
+
+ self.file_batch.append(rpd_file)
+
+ # If this is the file the sample photo metadata came from, generate
+ # the sample photo, which is sent along with the batch
+ if not self.found_sample_photo and file == self.sample_photo_file_full_file_name:
+ self.sample_photo = self.create_sample_rpdfile(name=self.file_name,
+ path=self.dir_name,
+ size=size,
+ mdatatime=mdatatime,
+ file_type=FileType.photo,
+ mtime=modification_time,
+ ignore_mdatatime=ignore_mdatatime)
+ self.sample_exif_bytes = None
+ self.found_sample_photo = True
+
+ # Likewise for the sample video
+ if not self.found_sample_video and file == self.sample_video_file_full_file_name:
+ self.sample_video = self.create_sample_rpdfile(name=self.file_name,
+ path=self.dir_name,
+ size=size,
+ mdatatime=mdatatime,
+ file_type=FileType.video,
+ mtime=modification_time,
+ ignore_mdatatime=ignore_mdatatime)
+ if self.sample_video_full_file_downloaded:
+ rpd_file.cache_full_file_name = self.sample_video_extract_full_file_name
+ self.sample_video_extract_full_file_name = None
+ self.found_sample_video = True
+
+ if len(self.file_batch) == self.batch_size:
+ self.content = pickle.dumps(ScanResults(
+ rpd_files=self.file_batch,
+ file_type_counter=self.file_type_counter,
+ file_size_sum=self.file_size_sum,
+ sample_photo=self.sample_photo,
+ sample_video=self.sample_video),
+ pickle.HIGHEST_PROTOCOL)
+ self.send_message_to_sink()
+ # Start a new batch; the samples have now been sent, so clear them
+ self.file_batch = []
+ self.sample_photo = None
+ self.sample_video = None
+
+    def send_message_to_sink(self) -> None:
+        """
+        Log the size of the current file batch, then send the message to
+        the sink using the base class implementation.
+        """
+
+        try:
+            batch_count = len(self.file_batch)
+            source = self.display_name
+            logging.debug("Sending %s scanned files from %s to sink", batch_count, source)
+        except AttributeError:
+            # display_name is assigned part-way through do_scan(), so it may
+            # not exist yet; skip the log message in that case
+            pass
+        super().send_message_to_sink()
+
+    def ignore_mdatatime(self, ext: str) -> bool:
+        """
+        Report whether the metadata date time should be ignored for a file.
+
+        :param ext: the file's extension; compared against 'dng' exactly
+        :return: truthy only when ignore_mdatatime_for_mtp_dng is set and
+         the extension is 'dng'
+        """
+
+        is_dng = ext == 'dng'
+        return self.ignore_mdatatime_for_mtp_dng and is_dng
+
+    def create_sample_rpdfile(self, path: str,
+                              name: str,
+                              size: int,
+                              mdatatime: float,
+                              file_type: FileType,
+                              mtime: float,
+                              ignore_mdatatime: bool) -> Union[rpdfile.Photo, rpdfile.Video]:
+        """
+        Generate the rpdfile for the sample photo or sample video.
+
+        Unlike the rpdfile generated for the same file in process_file(),
+        the sample includes the sample exif bytes and exif source, and has
+        no previous download details, associated files, or memory card
+        identifiers.
+
+        :param path: directory the file is in
+        :param name: the file's name
+        :param size: the file's size
+        :param mdatatime: the file's metadata date time
+        :param file_type: whether the sample is a photo or video
+        :param mtime: the file's modification time
+        :param ignore_mdatatime: passed to get_rpdfile as never_read_mdatatime
+        :return: the sample rpdfile
+        """
+
+        assert (
+            (self.sample_exif_source is not None and self.sample_photo_file_full_file_name)
+            or self.sample_video_file_full_file_name is not None
+        )
+        logging.info(
+            "Successfully extracted sample %s metadata from %s", file_type.name, self.display_name
+        )
+
+        sample_details = dict(
+            name=name,
+            path=path,
+            size=size,
+            prev_full_name=None,
+            prev_datetime=None,
+            device_timestamp_type=self.device_timestamp_type,
+            mtime=mtime,
+            mdatatime=mdatatime,
+            thumbnail_cache_status=ThumbnailCacheDiskStatus.unknown,
+            thm_full_name=None,
+            audio_file_full_name=None,
+            xmp_file_full_name=None,
+            scan_id=self.worker_id,
+            file_type=file_type,
+            from_camera=self.download_from_camera,
+            camera_details=self.camera_details,
+            camera_memory_card_identifiers=None,
+            never_read_mdatatime=ignore_mdatatime,
+            device_display_name=self.display_name,
+            device_uri=self.device.uri,
+            raw_exif_bytes=self.sample_exif_bytes,
+            exif_source=self.sample_exif_source,
+            problem=None,
+        )
+        sample = rpdfile.get_rpdfile(**sample_details)
+
+        if file_type == FileType.video and self.download_from_camera:
+            # relevant only when downloading from a camera
+            sample.temp_sample_full_file_name = self.sample_video_extract_full_file_name
+            sample.temp_sample_is_complete_file = self.sample_video_full_file_downloaded
+
+        return sample
+
def sample_camera_metadata(self, path: str,
                           name: str,
                           extension: str,
                           ext_type: FileExtension,
                           size: int,
                           modification_time: int) -> SampleMetadata:
    """
    Extract sample metadata, including specifically datetime, from a photo
    or video on a camera.

    Video files are special in that sometimes the entire file has to be
    read in order to extract its metadata.

    Problems encountered while reading from the camera or while loading
    metadata are appended to self.problems rather than raised.

    :param path: directory on the camera in which the file is found
    :param name: name of the file, without its path
    :param extension: the file's extension, used to look up how much of the
     file to read in all_tags_offset, and to name the temporary video extract
    :param ext_type: whether the file is a jpeg, RAW or video
    :param size: size of the file (presumably in bytes -- confirm with caller)
    :param modification_time: the file's modification time as reported by
     the camera
    :return: the extracted date time (None if it could not be determined),
     and a simple string naming the file type for use in log messages
    """

    dt = determined_by = None
    use_app1 = save_chunk = exif_extract = False
    if ext_type == FileExtension.jpeg:
        determined_by = 'jpeg'
        if self.camera.can_fetch_thumbnails:
            use_app1 = True
        else:
            exif_extract = True
    elif ext_type == FileExtension.raw:
        determined_by = 'RAW'
        exif_extract = True
    elif ext_type == FileExtension.video:
        determined_by = 'video'
        save_chunk = True

    if use_app1:
        try:
            self.sample_exif_bytes = self.camera.get_exif_extract_from_jpeg(path, name)
        except CameraProblemEx as e:
            uri = get_uri(full_file_name=os.path.join(path, name),
                          camera_details=self.camera_details)
            self.problems.append(CameraFileReadProblem(uri=uri, name=name, gp_code=e.gp_code))
        else:
            try:
                with stdchannel_redirected(sys.stderr, os.devnull):
                    metadata = metadataphoto.MetaData(app1_segment=self.sample_exif_bytes)
            except Exception:
                # Deliberately broad, but not bare: KeyboardInterrupt and
                # SystemExit must still propagate
                logging.warning("Scanner failed to load metadata from %s on %s", name,
                                self.camera.display_name)
                self.sample_exif_bytes = None
                uri = get_uri(full_file_name=os.path.join(path, name),
                              camera_details=self.camera_details)
                self.problems.append(FileMetadataLoadProblem(uri=uri, name=name))
            else:
                self.sample_exif_source = ExifSource.app1_segment
                self.sample_photo_file_full_file_name = os.path.join(path, name)
                dt = metadata.date_time(missing=None)  # type: datetime
    elif exif_extract:
        offset = all_tags_offset.get(extension)
        if offset is None:
            offset = size
        offset = min(size, offset)
        self.sample_exif_bytes = self.camera.get_exif_extract(path, name, offset)
        if self.sample_exif_bytes is not None:
            try:
                with stdchannel_redirected(sys.stderr, os.devnull):
                    metadata = metadataphoto.MetaData(raw_bytes=self.sample_exif_bytes)
            except Exception:
                # See note above on why this is not a bare except
                logging.warning("Scanner failed to load metadata from %s on %s", name,
                                self.camera.display_name)
                self.sample_exif_bytes = None
                uri = get_uri(full_file_name=os.path.join(path, name),
                              camera_details=self.camera_details)
                self.problems.append(FileMetadataLoadProblem(uri=uri, name=name))
            else:
                self.sample_exif_source = ExifSource.raw_bytes
                self.sample_photo_file_full_file_name = os.path.join(path, name)
                dt = metadata.date_time(missing=None)  # type: datetime
    else:
        assert save_chunk
        # video
        offset = all_tags_offset.get(extension)
        if offset is None:
            max_size = 1024**2 * 20  # approx 21 MB
            offset = min(size, max_size)
        # Never request more than the file holds, matching the photo branch
        offset = min(size, offset)

        # First try offset value, and if it fails, read the entire video
        # Reading the metadata on some videos will fail if the entire video
        # is not read, e.g. an iPhone 5 video
        # When the first attempt already covers the whole file, a second
        # identical download cannot help, so it is skipped
        chunk_sizes = (offset,) if offset == size else (offset, size)

        # Does not change between attempts, so work it out only once
        mtime = int(self.adjusted_mtime(float(modification_time)))

        temp_name = os.path.join(tempfile.gettempdir(),
                                 GenerateRandomFileName().name(extension=extension))
        with ExifTool() as et_process:
            for chunk_size in chunk_sizes:
                if chunk_size == size:
                    logging.debug("Downloading entire video for metadata sample (%s)",
                                  format_size_for_user(size))
                try:
                    self.camera.save_file_chunk(path, name, chunk_size, temp_name, mtime)
                except CameraProblemEx as e:
                    if e.code == CameraErrorCode.read:
                        uri = get_uri(os.path.join(path, name),
                                      camera_details=self.camera_details)
                        self.problems.append(CameraFileReadProblem(uri=uri, name=name,
                                                                   gp_code=e.gp_code))
                    else:
                        assert e.code == CameraErrorCode.write
                        uri = get_uri(path=os.path.dirname(temp_name))
                        self.problems.append(FileWriteProblem(uri=uri, name=temp_name,
                                                              exception=e.py_exception))
                else:
                    metadata = metadatavideo.MetaData(temp_name, et_process)
                    dt = metadata.date_time(missing=None, ignore_file_modify_date=True)
                    width = metadata.width(missing=None)
                    height = metadata.height(missing=None)
                    # The extract is only kept as the sample when all three
                    # values could be read from it
                    if dt is not None and width is not None and height is not None:
                        self.sample_video_full_file_downloaded = chunk_size == size
                        self.sample_video_extract_full_file_name = temp_name
                        self.sample_video_file_full_file_name = os.path.join(path, name)
                        break

    if dt is None:
        logging.warning("Scanner failed to extract date time metadata from %s on %s",
                        name, self.camera.display_name)
    else:
        self.file_mdatatime[os.path.join(path, name)] = float(dt.timestamp())
        logging.info("Extracted date time value %s for %s on %s", dt, name,
                     self.camera_display_name)
    return SampleMetadata(dt, determined_by)
+
def sample_non_camera_metadata(self, path: str,
                               name: str,
                               full_file_name: str,
                               ext_type: FileExtension) -> SampleMetadata:
    """
    Read the date time metadata of a sample photo or video that is not on
    a camera.

    A photo that fails to load is recorded in self.problems. When a date
    time is found it is also stored in self.file_mdatatime, keyed by the
    full file name.

    :param path: directory containing the file
    :param name: name of the file, without its path
    :param full_file_name: the file name including its path
    :param ext_type: whether the file is a jpeg, RAW or video
    :return: the date time (None if not determined) and a simple string
     naming the file type for use in log messages
    """

    log_labels = {
        FileExtension.jpeg: 'jpeg',
        FileExtension.raw: 'RAW',
        FileExtension.video: 'video',
    }
    determined_by = log_labels.get(ext_type)
    dt = None

    is_video = ext_type == FileExtension.video
    if not is_video:
        # photo - jpeg and RAW are handled identically
        try:
            with stdchannel_redirected(sys.stderr, os.devnull):
                metadata = metadataphoto.MetaData(full_file_name=full_file_name)
        except Exception:
            logging.warning("Scanner failed to load metadata from %s on %s", name,
                            self.display_name)
            self.problems.append(
                FileMetadataLoadProblem(uri=get_uri(full_file_name=full_file_name), name=name)
            )
        else:
            self.sample_exif_source = ExifSource.actual_file
            self.sample_photo_file_full_file_name = os.path.join(path, name)
            dt = metadata.date_time(missing=None)  # type: datetime
    else:
        with ExifTool() as et_process:
            metadata = metadatavideo.MetaData(full_file_name, et_process)
            self.sample_video_file_full_file_name = os.path.join(path, name)
            dt = metadata.date_time(missing=None)

    if dt is not None:
        self.file_mdatatime[full_file_name] = dt.timestamp()
    else:
        logging.warning("Scanner failed to extract date time metadata from %s on %s",
                        name, self.display_name)
    return SampleMetadata(dt, determined_by)
+
def examine_sample_non_camera_file(self, dirname: str,
                                   name: str,
                                   full_file_name: str,
                                   ext_type: FileExtension) -> bool:
    """
    Examine the sample file to extract its metadata and compare it
    against the file system modification time.

    :param dirname: directory containing the file
    :param name: name of the file, without its path
    :param full_file_name: the file name including its path
    :param ext_type: whether the file is a jpeg, RAW or video
    :return: True if the sample had a metadata date time and a readable
     modification time, such that the device's timestamp approach was
     examined, else False
    """

    logging.debug("Examining sample %s", full_file_name)
    sample = self.sample_non_camera_metadata(dirname, name, full_file_name, ext_type)
    if sample.datetime is None:
        # Nothing to compare against. Return an explicit False so the
        # declared bool return type holds (previously None was returned).
        return False

    self.file_mdatatime[full_file_name] = sample.datetime.timestamp()
    try:
        mtime = os.path.getmtime(full_file_name)
    except OSError as e:
        logging.warning("Could not determine modification time for %s",
                        full_file_name)
        uri = get_uri(full_file_name=full_file_name)
        self.problems.append(FsMetadataReadProblem(uri=uri, name=name, exception=e))
        return False

    # Located sample file: examine
    self.determine_device_timestamp_tz(
        sample.datetime, mtime, sample.determined_by)
    return True
+
def distinguish_non_camera_device_timestamp(self, path: str) -> None:
    """
    Attempt to determine the device's approach to timezones when it
    stores timestamps.

    When determining how this device reports modification time, file
    preference is (1) RAW, (2) jpeg, and finally least preferred is (3)
    video -- a RAW is the least likely to be modified.

    RAW files are examined as soon as they are encountered. jpeg and video
    files are queued and examined only if no RAW sample succeeded.

    NOTE: this creates a sample file for one type of file (RAW if present,
    if not, then jpeg, if jpeg also not present, then video). However if
    a raw / jpeg is found, then still need to create sample file for video.

    :param path: the path whose files are walked in search of a sample
    """

    logging.debug("Distinguishing approach to timestamp time zones on %s", self.display_name)

    self.device_timestamp_type = DeviceTimestampTZ.unknown

    max_attempts = 10
    raw_attempts = 0
    jpegs_and_videos = defaultdict(deque)
    sample_types = (FileExtension.raw, FileExtension.jpeg, FileExtension.video)

    for dir_name, name in self.walk_file_system(path):
        full_file_name = os.path.join(dir_name, name)
        extension = os.path.splitext(full_file_name)[1].lower()[1:]
        ext_type = rpdfile.extension_type(extension)
        if ext_type not in sample_types:
            continue

        if ext_type == FileExtension.raw and raw_attempts < max_attempts:
            # examine right away
            raw_attempts += 1
            if self.examine_sample_non_camera_file(dirname=dir_name, name=name,
                    full_file_name=full_file_name, ext_type=ext_type):
                return
            continue

        queued = jpegs_and_videos[ext_type]
        if len(queued) < max_attempts:
            queued.append((dir_name, name, full_file_name))

        # Stop walking once enough jpegs have been collected
        if len(jpegs_and_videos[FileExtension.jpeg]) == max_attempts:
            break

    # Couldn't locate sample raw file. Are left with up to max_attempts jpeg and video files
    for ext_type in (FileExtension.jpeg, FileExtension.video):
        examined = any(
            self.examine_sample_non_camera_file(dirname=dir_name, name=name,
                full_file_name=full_file_name, ext_type=ext_type)
            for dir_name, name, full_file_name in jpegs_and_videos[ext_type]
        )
        if examined:
            return
+
def determine_device_timestamp_tz(self, mdatatime: datetime,
                                  modification_time: Union[int, float],
                                  determined_by: str) -> None:
    """
    Compare metadata time with file modification time in an attempt
    to determine the device's approach to timezones when it stores timestamps.

    The result is stored in self.device_timestamp_type.

    :param mdatatime: file's metadata time. If None, the timestamp type is
     set to unknown and no comparison is attempted.
    :param modification_time: file's file system modification time
    :param determined_by: simple string used in log messages
    """

    if mdatatime is None:
        logging.debug("Could not determine Device timezone setting for %s",
                      self.display_name)
        self.device_timestamp_type = DeviceTimestampTZ.unknown
        # Without a metadata time there is nothing to compare: stop here
        # instead of falling through and comparing against None
        return

    # Must not compare exact times, as there can be a few seconds difference between
    # when a file was saved to the flash memory and when it was created in the
    # camera's memory. Allow for two minutes, to be safe.
    # (The actual tolerance is whatever datetime_roughly_equal applies.)
    if datetime_roughly_equal(dt1=datetime.utcfromtimestamp(modification_time),
                              dt2=mdatatime):
        logging.info("Device timezone setting for %s is UTC, as indicated by %s file",
                     self.display_name, determined_by)
        self.device_timestamp_type = DeviceTimestampTZ.is_utc
    elif datetime_roughly_equal(dt1=datetime.fromtimestamp(modification_time),
                                dt2=mdatatime):
        logging.info("Device timezone setting for %s is local time, as indicated by "
                     "%s file", self.display_name, determined_by)
        self.device_timestamp_type = DeviceTimestampTZ.is_local
    else:
        logging.info("Device timezone setting for %s is unknown, because the file "
                     "modification time and file's time as recorded in metadata differ for "
                     "sample file %s",
                     self.display_name, determined_by)
        self.device_timestamp_type = DeviceTimestampTZ.unknown
+
def adjusted_mtime(self, mtime: float) -> float:
    """
    Calculate the modification time in the same way it will be calculated
    when the mtime is saved in the rpd_file.

    :param mtime: raw modification time
    :return: the modification time, adjusted only if the device's
     timestamp type has been determined to be UTC
    """

    device_is_utc = self.device_timestamp_type == DeviceTimestampTZ.is_utc
    if not device_is_utc:
        return mtime
    return datetime.utcfromtimestamp(mtime).timestamp()
+
+ def _get_associate_file_from_camera(self, base_name: str,
+ associate_files: defaultdict, camera_file: CameraFile) -> Optional[str]:
+ for path, ext in associate_files[base_name]:
+ if path in self._camera_directories_for_file[camera_file]:
+ return '{}.{}'.format(os.path.join(path, base_name),ext)
+ return None
+
def get_video_THM_file(self, base_name: str, camera_file: CameraFile) -> Optional[str]:
    """
    Look for a thumbnail file (THM) that has the same base name as the
    file.

    :param base_name: the file name without the extension
    :param camera_file: the file on the camera; used only when
     downloading from a camera
    :return: filename, including path, if found, else returns None
    """

    if not self.download_from_camera:
        return self._get_associate_file(base_name, rpdfile.VIDEO_THUMBNAIL_EXTENSIONS)

    return self._get_associate_file_from_camera(
        base_name, self._camera_video_thumbnails, camera_file
    )
+
def get_audio_file(self, base_name: str, camera_file: CameraFile) -> Optional[str]:
    """
    Look for an audio file that has the same base name as the file.

    :param base_name: the file name without the extension
    :param camera_file: the file on the camera; used only when
     downloading from a camera
    :return: filename, including path, if found, else returns None
    """

    if not self.download_from_camera:
        return self._get_associate_file(base_name, rpdfile.AUDIO_EXTENSIONS)

    return self._get_associate_file_from_camera(
        base_name, self._camera_audio_files, camera_file
    )
+
def get_xmp_file(self, base_name: str, camera_file: CameraFile) -> Optional[str]:
    """
    Look for an XMP file that has the same base name as the file.

    :param base_name: the file name without the extension
    :param camera_file: the file on the camera; used only when
     downloading from a camera
    :return: filename, including path, if found, else returns None
    """

    if not self.download_from_camera:
        return self._get_associate_file(base_name, ['XMP'])

    return self._get_associate_file_from_camera(
        base_name, self._camera_xmp_files, camera_file
    )
+
+ def _get_associate_file(self, base_name: str, extensions_to_check: List[str]) -> Optional[str]:
+ full_file_name_no_ext = os.path.join(self.dir_name, base_name)
+ for e in extensions_to_check:
+ possible_file = '{}.{}'.format(full_file_name_no_ext, e)
+ if os.path.exists(possible_file):
+ return possible_file
+ possible_file = '{}.{}'.format(full_file_name_no_ext, e.upper())
+ if os.path.exists(possible_file):
+ return possible_file
+ return None
+
def cleanup_pre_stop(self):
    """
    Release the camera, if there is one, and then send the accumulated
    problems.
    """

    camera = self.camera
    if camera is not None:
        camera.free_camera()
    self.send_problems()
+
@property
def camera_details(self) -> Optional[CameraDetails]:
    """
    The camera details most recently built by assigning a storage index to
    this property.
    """
    return self._camera_details

@camera_details.setter
def camera_details(self, index: Optional[int]) -> None:
    """
    Build the camera details for one of the camera's storages.

    The storage descriptions are fetched from the camera if they are not
    already held in self.camera_storage_descriptions.

    :param index: index into the storage details, for cameras with more than one
     storage. None is treated as 0.
    """

    if not self.camera_storage_descriptions:
        self.camera_storage_descriptions = self.camera.get_storage_descriptions()

    if self.camera_storage_descriptions:
        storage_desc = self.camera_storage_descriptions[index or 0]
    else:
        # Problem: there are no descriptions for the storage
        storage_desc = []

    self._camera_details = CameraDetails(
        model=self.camera_model,
        port=self.camera_port,
        display_name=self.camera_display_name,
        is_mtp=self.is_mtp_device,
        storage_desc=storage_desc,
    )
+
+
def trace_lines(frame, event, arg):
    """
    Local trace function that prints the time, function name and line
    number for every 'line' event. All other events are ignored.

    :param frame: the frame being executed
    :param event: the trace event name
    :param arg: unused
    """
    if event == 'line':
        code = frame.f_code
        print('%s >>>>>>>>>>>>> At %s line %s' % (
            datetime.now().ctime(), code.co_name, frame.f_lineno))
+
def trace_calls(frame, event, arg):
    """
    Global trace function, intended for sys.settrace(), that prints every
    function call along with where it was called from.

    Calls to functions named 'write' or '__getattribute__' are ignored.

    :param frame: the frame of the function being called
    :param event: the trace event name; only 'call' is handled
    :param arg: unused
    :return: trace_lines when the called function is one whose execution
     should be traced line by line, else None
    """
    if event != 'call':
        return None
    co = frame.f_code
    func_name = co.co_name
    if func_name in ('write', '__getattribute__'):
        return None
    func_line_no = frame.f_lineno
    func_filename = co.co_filename
    caller = frame.f_back
    if caller is not None:
        caller_line_no = caller.f_lineno
        caller_filename = caller.f_code.co_filename
    else:
        caller_line_no = caller_filename = ''
    print('% s Call to %s on line %s of %s from line %s of %s' %
          (datetime.now().ctime(), func_name, func_line_no, func_filename, caller_line_no,
           caller_filename))

    # Names must match the real method names exactly. The first entry was
    # previously misspelt, so that method was never traced line by line.
    traced = ('distinguish_non_camera_device_timestamp', 'determine_device_timestamp_tz')
    if any(f in func_name for f in traced):
        # Trace into this function
        return trace_lines
    return None
+
if __name__ == "__main__":
    # Call tracing is switched on merely by RPD_SCAN_DEBUG being present
    # in the environment; its value is not examined
    debug_requested = os.getenv('RPD_SCAN_DEBUG') is not None
    if debug_requested:
        sys.settrace(trace_calls)
    scan = ScanWorker()
+
+