# Collects files using a set of globs. All globs must be on the same device.
# The globs are searched in one pass, so many globs can be provided at once.
# Artifact identity: its name, an alias name, and the description text.
name: Generic.Collectors.File
aliases:
- Windows.Collectors.File
description: |
  Collects files using a set of globs. All globs must be on the same
  device. The globs will be searched in one pass - so you can provide
  many globs at the same time.
# Inputs of the artifact; each list entry declares one parameter.
parameters:
# CSV table of glob expressions, one per row under the `Glob` header.
- name: collectionSpec
  type: csv
  description: |
    A CSV file with a Glob column with all the globs to collect.
    NOTE: Globs must not have a leading device.
  default: |
    Glob
    Users\*\NTUser.dat
# Base location; the metadata query turns it into a pathspec and prepends it
# to every glob from collectionSpec.
- name: Root
  default: "C:"
  description: |
    On Windows, this is the device to apply all the glob on
    (e.g. `C:`). On *NIX, this should be a path to a subdirectory or
    /.
# Accessor name that the queries pass to pathspec(), glob() and upload().
- name: Accessor
  description: |
    On Windows, this can be changed to `ntfs`.
  default: auto
# Integer setting for the NTFS cache. The queries visible in this artifact do
# not reference it by name; presumably it is read from scope by the accessor
# (TODO confirm).
- name: NTFS_CACHE_TIME
  description: How often to flush the NTFS cache. (Default is never).
  type: int
  default: "1000000"
# Upper bound, in bytes, on the size of any single file that is collected.
# The `All Matches Metadata` query is what enforces this limit.
# The default is 2^64 - 1. It is quoted so that YAML loaders keep it as an
# exact string instead of an implicitly typed (and possibly rounded) number,
# which also matches the quoted default of NTFS_CACHE_TIME.
# NOTE(review): confirm that the int conversion applied to this parameter
# accepts values above the signed 64-bit range.
- name: MaxFileSize
  type: int
  default: "18446744073709551615"
  description: |
    The max size in bytes of the individual files to collect.
    Set to 0 to disable it.
# Query sources. This first source finds the matching files and emits their
# metadata; the `Uploads` source reads the `all_results` variable defined here.
sources:
- name: All Matches Metadata
  query: |
    -- Base path that every glob from collectionSpec is appended to.
    LET RootPath <= pathspec(Path=Root, accessor=Accessor)

    -- Generate the collection globs for each device, logging each one.
    LET specs = SELECT RootPath + Glob AS Glob
      FROM collectionSpec
      WHERE log(message=format(
          format="Processing Device %v with %v: glob is %v",
          args=[Root, Accessor, Glob]))

    -- Join all the collection rules into a single glob() call. This ensures
    -- we only make one pass over the filesystem. We only want LFNs.
    --
    -- Files larger than MaxFileSize are logged and dropped. MaxFileSize = 0
    -- disables the limit, as the parameter description states; without the
    -- explicit check, 0 would reject every non-empty file.
    LET hits = SELECT OSPath AS SourceFile,
             Size,
             Btime AS Created,
             Ctime AS Changed,
             Mtime AS Modified,
             Atime AS LastAccessed
      FROM glob(globs=specs.Glob, accessor=Accessor)
      WHERE NOT IsDir
        AND log(message="Found " + SourceFile)
        AND ( MaxFileSize = 0
           OR Size <= MaxFileSize
           OR ( log(message="Skipping file " + SourceFile +
                            " Due to MaxFileSize")
                AND FALSE ))

    -- Pass all the results to the next query. This will serialize
    -- to disk if there are too many results.
    LET all_results <= SELECT Created,
             Changed,
             LastAccessed,
             Modified,
             Size,
             SourceFile
      FROM hits

    SELECT *
    FROM all_results
# Uploads every file listed in `all_results` and reports one row per upload.
- name: Uploads
  query: |
    -- Upload the files. Split into workers so the files are uploaded in
    -- parallel.
    LET uploaded_files = SELECT * FROM foreach(
        row={ SELECT * FROM all_results },
        workers=30,
        query={
          SELECT Created, Changed, LastAccessed, Modified, SourceFile, Size,
                 upload(file=SourceFile,
                        accessor=Accessor,
                        mtime=Modified) AS Upload
          FROM scope()
        })

    -- Separate the hashes into their own column.
    SELECT now() AS CopiedOnTimestamp, SourceFile,
           Upload.Path AS DestinationFile,
           Size AS FileSize,
           Upload.sha256 AS SourceFileSha256,
           Created, Changed, Modified, LastAccessed
    FROM uploaded_files