Generic.Collectors.File

Collects files using a set of globs. All globs must be on the same device. The globs will be searched in one pass - so you can provide many globs at the same time.


# Artifact identity: canonical name, human-readable summary, and an
# alternate name under which the same artifact can be referenced.
name: Generic.Collectors.File
description: |
  Collects files using a set of globs. All globs must be on the same
  device. The globs will be searched in one pass - so you can provide
  many globs at the same time.

aliases:
  - Windows.Collectors.File

# User-supplied inputs. Every default is written as a string; `type`
# tells the consumer how to interpret it.
parameters:
  # Table of glob expressions. The query prefixes each Glob with Root,
  # which is why a glob must not carry its own device.
  - name: collectionSpec
    description: |
      A CSV file with a Glob column with all the globs to collect.
      NOTE: Globs must not have a leading device.
    type: csv
    default: |
      Glob
      Users\*\NTUser.dat

  # Base path that every glob is joined onto.
  - name: Root
    description: |
      On Windows, this is the device to apply all the globs to
      (e.g. `C:`). On *NIX, this should be a path to a subdirectory or
      /.
    default: "C:"

  # Filesystem accessor handed to pathspec(), glob() and upload().
  - name: Accessor
    description: |
      On Windows, this can be changed to `ntfs`.
    default: auto

  # NOTE(review): not referenced by name in the queries of this artifact;
  # presumably read from scope by the ntfs accessor - confirm.
  - name: NTFS_CACHE_TIME
    type: int
    description: >-
      How often to flush the NTFS cache. The default is large enough
      that the cache is effectively never flushed.
    default: "1000000"


sources:
   # Source 1: enumerate every file matching the requested globs and
   # emit its metadata (no file content is read here).
   #
   # Steps performed by the query below:
   #   1. RootPath  - pathspec built from the Root and Accessor parameters.
   #   2. specs     - one row per collectionSpec row, with RootPath
   #                  prepended to Glob. log() sits in the WHERE clause so
   #                  a "Processing Device ..." message is emitted per row
   #                  (presumably log() is truthy, so no row is dropped -
   #                  confirm).
   #   3. hits      - a single glob() call over all of specs.Glob;
   #                  directories are excluded and the B/C/M/A timestamps
   #                  are renamed to Created/Changed/Modified/LastAccessed.
   #   4. all_results - hits materialized with `<=`, then returned as the
   #                  rows of this source. The Uploads source also reads
   #                  all_results by name, so keep the name stable.
   - name: All Matches Metadata
     # VQL follows; lines starting with `--` inside the scalar are VQL
     # comments and are part of the query text.
     query: |
      LET RootPath <= pathspec(Path=Root, accessor=Accessor)

      -- Generate the collection globs for each device
      LET specs = SELECT RootPath + Glob AS Glob
            FROM collectionSpec
            WHERE log(message=format(
               format="Processing Device %v with %v: glob is %v",
               args=[Root, Accessor, Glob]))

      -- Join all the collection rules into a single Glob plugin. This ensure we
      -- only make one pass over the filesystem. We only want LFNs.
      LET hits = SELECT OSPath AS SourceFile, Size,
               Btime AS Created,
               Ctime AS Changed,
               Mtime AS Modified,
               Atime AS LastAccessed
        FROM glob(globs=specs.Glob, accessor=Accessor)
        WHERE NOT IsDir AND log(message="Found " + SourceFile)

      -- Pass all the results to the next query. This will serialize
      -- to disk if there are too many results.
      LET all_results <=
         SELECT Created, Changed, LastAccessed, Modified, Size, SourceFile
         FROM hits

      SELECT * FROM all_results

   # Source 2: upload every file found earlier and report where each
   # copy was stored.
   #
   # all_results is not defined in this query; it is the variable
   # materialized by the `All Matches Metadata` source (presumably the
   # sources of one artifact share a scope - confirm).
   #
   # Steps performed by the query below:
   #   1. uploaded_files - foreach() over all_results with workers=30,
   #      calling upload() on SourceFile through the Accessor parameter
   #      and passing the file's Modified time as mtime.
   #   2. Final SELECT - flattens the Upload result: Upload.Path becomes
   #      DestinationFile and Upload.sha256 becomes SourceFileSha256;
   #      Size is reported as FileSize and now() as CopiedOnTimestamp.
   - name: Uploads
     # VQL follows; lines starting with `--` inside the scalar are VQL
     # comments and are part of the query text.
     query: |
      -- Upload the files
      LET uploaded_files = SELECT * FROM foreach(row={
          SELECT * FROM all_results
        },
        workers=30,
        query={
          SELECT Created, Changed, LastAccessed, Modified, SourceFile, Size,
               upload(file=SourceFile,
                      accessor=Accessor,
                      mtime=Modified) AS Upload
            FROM scope()
        })

      -- Separate the hashes into their own column.
      SELECT now() AS CopiedOnTimestamp, SourceFile,
             Upload.Path AS DestinationFile,
               Size AS FileSize, Upload.sha256 AS SourceFileSha256,
               Created, Changed, Modified, LastAccessed
        FROM uploaded_files