# Generic.Collectors.File
# Collects files using a set of globs. All globs must be on the same
# device. The globs will be searched in one pass - so you can provide
# many globs at the same time.

# Artifact identity: the canonical name, the operator-facing description,
# and the alias under which the same artifact can also be referenced.
name: Generic.Collectors.File
description: |
  Collects files using a set of globs. All globs must be on the same
  device. The globs will be searched in one pass - so you can provide
  many globs at the same time.

aliases:
  - Windows.Collectors.File

# User-tunable inputs. Every name below is referenced (or exposed to the
# query scope) under exactly this spelling, so names must not change.
parameters:
  # Table of globs to collect. The first source prefixes each Glob value
  # with Root before handing the whole set to a single glob() call.
  - name: collectionSpec
    description: |
      A CSV file with a Glob column with all the globs to collect.
      NOTE: Globs must not have a leading device.
    type: csv
    default: |
      Glob
      Users\*\NTUser.dat

  # Base path that every glob is anchored to.
  - name: Root
    description: |
      On Windows, this is the device to apply all the glob on
      (e.g. `C:`). On *NIX, this should be a path to a subdirectory or
      /.
    default: "C:"

  # Filesystem accessor used for globbing and for reading the uploads.
  - name: Accessor
    default: auto
    description: |
      On Windows, this can be changed to `ntfs`.

  # NOTE(review): not referenced by any query in this file; presumably
  # read from the scope by the ntfs accessor itself - confirm.
  - name: NTFS_CACHE_TIME
    type: int
    description: How often to flush the NTFS cache. (Default is never).
    default: "1000000"

  # The default is 2^64-1, i.e. effectively "no limit". It is quoted so
  # that every YAML loader keeps the exact digits: as a plain scalar it
  # exceeds the signed 64-bit range and some loaders would round it to a
  # float or reject it. This also matches how NTFS_CACHE_TIME is written.
  - name: MaxFileSize
    type: int
    default: "18446744073709551615"
    description: |
      The max size in bytes of the individual files to collect.
      Set to 0 to disable it.


sources:
   # First pass: enumerate every file matching the collection globs and
   # emit its metadata. The materialized `all_results` variable defined
   # here is consumed by the Uploads source.
   - name: All Matches Metadata
     query: |
       -- Anchor for all globs: the Root parameter interpreted through
       -- the selected accessor.
       LET RootPath <= pathspec(Path=Root, accessor=Accessor)

       -- Generate the full collection glob for each row of the spec.
       -- The log() call in the WHERE clause is there for its side
       -- effect: one message per glob being processed.
       LET specs = SELECT RootPath + Glob AS Glob
         FROM collectionSpec
         WHERE log(message=format(format="Processing Device %v with %v: glob is %v",
                                  args=[Root, Accessor, Glob]))

       -- Join all the collection rules into a single glob() call. This
       -- ensures we only make one pass over the filesystem.
       -- NOTE(review): the original comment added "We only want LFNs";
       -- nothing in this query visibly filters on that - confirm whether
       -- it is still relevant.
       --
       -- Size filter: a MaxFileSize of 0 means "no limit", as the
       -- parameter description promises. Without the NOT MaxFileSize
       -- short-circuit, 0 would skip every non-empty file. Files over
       -- the limit are logged and then dropped by the AND FALSE.
       LET hits = SELECT OSPath AS SourceFile,
                         Size,
                         Btime AS Created,
                         Ctime AS Changed,
                         Mtime AS Modified,
                         Atime AS LastAccessed
         FROM glob(globs=specs.Glob, accessor=Accessor)
         WHERE NOT IsDir
          AND log(message="Found " + SourceFile)
          AND ( NOT MaxFileSize
                OR Size <= MaxFileSize
                OR ( log(message="Skipping file " + SourceFile + " Due to MaxFileSize")
                     AND FALSE ))

       -- Pass all the results to the next query. This will serialize
       -- to disk if there are too many results.
       LET all_results <= SELECT Created,
                                 Changed,
                                 LastAccessed,
                                 Modified,
                                 Size,
                                 SourceFile
         FROM hits

       SELECT *
       FROM all_results


   # Second pass: upload every file listed in `all_results` (defined by
   # the All Matches Metadata source) and report where each copy went.
   - name: Uploads
     query: |
       -- Upload the files. foreach() is given 30 workers so the rows
       -- are handled in parallel; each row's columns are visible to the
       -- inner query, which is why it selects FROM scope().
       LET uploaded_files = SELECT *
         FROM foreach(
           row={ SELECT * FROM all_results },
           workers=30,
           query={
             SELECT Created,
                    Changed,
                    LastAccessed,
                    Modified,
                    SourceFile,
                    Size,
                    upload(file=SourceFile,
                           accessor=Accessor,
                           mtime=Modified) AS Upload
             FROM scope()
           })

       -- Flatten the Upload result: its Path and sha256 fields get
       -- their own columns next to the original file metadata.
       SELECT now() AS CopiedOnTimestamp,
              SourceFile,
              Upload.Path AS DestinationFile,
              Size AS FileSize,
              Upload.sha256 AS SourceFileSha256,
              Created,
              Changed,
              Modified,
              LastAccessed
       FROM uploaded_files