PDForensic (version 0.2.0)
index
PDForensic.py

This tool analyses PDF files for Forensic Investigations
 
~# cat blank.pdf | python3.11 PDForensic.py - *.pdf ../*.pdf https://www.pdfscripting.com/public/FreeStuff/PDFSamples/TheFlyv3_EN4Rdr.pdf
...
~# python3.11 PDForensic.py blank.pdf
0         pdf_tag                   b'%PDF-1.6\r'
1         type                      ObjStm
2         type                      ObjStm
3         type                      XRef
4         type                      Outlines
5         subtype                   Type1
5         type                      Font
7         type                      Pages
9         subtype                   XML
9         type                      Metadata
10        date                      b"D:20060216150351-08'00'"
10        date                      b"D:20080816125100-07'00'"
15        date                      b"D:20080816125100-07'00'"
15        type                      TransformParams
15        type                      SigRef
15        type                      Sig
15        acroform                  b'/AcroForm 21 0 R/'
15        type                      Catalog
17        scripts                   b'/JavaScript'
19        scripts                   b'/JavaScript/JS'
21        type                      OCG
22        type                      Page
23        subtype                   Widget
23        type                      Annot
24        subtype                   Form
24        type                      XObject
25        subtype                   Form
25        type                      XObject
27        subtype                   Image
27        type                      XObject
28        subtype                   Image
28        type                      XObject
29        subtype                   Widget
29        type                      Annot
30        subtype                   Form
30        type                      XObject
31        subtype                   Form
31        type                      XObject
32        subtype                   Form
32        type                      XObject
33        subtype                   Form
33        type                      XObject
34        subtype                   Widget
34        type                      Annot
35        subtype                   Form
35        type                      XObject
36        subtype                   Form
36        type                      XObject
37        subtype                   Form
37        type                      XObject
38        subtype                   Form
38        type                      XObject
39        subtype                   Widget
39        type                      Annot
40        subtype                   Form
40        type                      XObject
41        subtype                   Form
41        type                      XObject
42        subtype                   Form
42        type                      XObject
43        subtype                   Form
43        type                      XObject
44        subtype                   Widget
44        type                      Annot
45        subtype                   Form
45        type                      XObject
46        subtype                   Type1
46        type                      Font
47        type                      Encoding
48        subtype                   Link
48        type                      Border
48        type                      Annot
49        subtype                   Widget
49        type                      Annot
50        subtype                   Form
50        type                      XObject
51        subtype                   Type1
51        type                      Font
52        subtype                   Widget
52        type                      Annot
53        subtype                   Form
53        type                      XObject
54        subtype                   Form
54        type                      XObject
55        subtype                   Widget
55        type                      Annot
56        subtype                   Form
56        type                      XObject
57        subtype                   Form
57        type                      XObject
58        subtype                   Widget
58        type                      Annot
59        subtype                   Form
59        type                      XObject
60        subtype                   Widget
60        type                      Annot
61        subtype                   Form
61        type                      XObject
62        subtype                   Form
62        type                      XObject
63        subtype                   Image
63        type                      XObject
64        subtype                   Widget
64        type                      Annot
65        subtype                   Form
65        type                      XObject
66        subtype                   Form
66        type                      XObject
67        subtype                   Form
67        type                      XObject
68        subtype                   Popup
68        type                      Annot
69        subtype                   Widget
69        type                      Annot
70        subtype                   Form
70        type                      XObject
71        subtype                   Type1
71        type                      Font
72        type                      Encoding
74        subtype                   TrueType
74        type                      Font
75        subtype                   TrueType
75        type                      Font
76        subtype                   TrueType
76        type                      Font
77        subtype                   TrueType
77        type                      Font
78        URI                       b'/URI(http://www.pdfscripting.com)/S/URI>>'
79        scripts                   b'/JavaScript/JS(\\nif\\(this.bouncing\\)\\r\\n{\\r\\n\tthis.bouncing = false;\\r\\n\tapp.clearInterval\\(this.bounceTime\\);\\r\\n\tthis.bounceTime = null;\\r\\n}\\r\\n\\r\\n//app.clearInterval\\(timer\\); // stop timer\\r\\n//app.clearTimeOut\\(timeout\\); // stop timer\\r\\n\\r\\n\\r\\n\\r)>>'
81        scripts                   b'/JavaScript/JS'
83        scripts                   b'/JavaScript/JS(\\nXinc = 5;\\r\\nYinc = 5;\\r\\n\\r\\n\\r)>>'
84        scripts                   b'/JavaScript/JS(\\nXinc = 3;\\r\\nYinc = 3;\\r\\n\\r\\n\\r)>>'
85        scripts                   b'/JavaScript/JS(\\nXinc = 1;\\r\\nYinc = 1;\\r\\n\\r)>>'
86        scripts                   b'/JavaScript/JS'
89        subtype                   Image
89        type                      XObject
91        type                      FontDescriptor
93        type                      FontDescriptor
94        type                      FontDescriptor
95        type                      ExtGState
96        type                      ExtGState
97        URI                       b'/URI(http://www.windjack.com)/S/URI>>'
98        xref                      
99        xref                      
100       startxref                 
101       eof_tag                   b'%%EOF\r'
{
    "tool": "PDForensic",
    "version": "0.0.1",
    "file": "<http.client.HTTPResponse object at 0x7f4cbbc6fd60>",
    "date": "2022-12-27T18:53:59.873367",
    "malicious": {
        "score": "26%",
        "types": [
            "acroform",
            "scripts",
            "URI"
        ]
    },
    "objects": {
        "found": 102,
        "processed": 146,
        "counter": {
            "type - XObject": 31,
            "subtype - Form": 27,
            "type - Annot": 14,
            "subtype - Widget": 12,
            "type - Font": 8,
            "subtype - Type1": 4,
            "subtype - Image": 4,
            "subtype - TrueType": 4,
            "type - FontDescriptor": 3,
            "type - ObjStm": 2,
            "type - Encoding": 2,
            "type - ExtGState": 2,
            "type - XRef": 1,
            "type - Outlines": 1,
            "type - Pages": 1,
            "subtype - XML": 1,
            "type - Metadata": 1,
            "type - TransformParams": 1,
            "type - SigRef": 1,
            "type - Sig": 1,
            "type - Catalog": 1,
            "type - OCG": 1,
            "type - Page": 1,
            "subtype - Link": 1,
            "type - Border": 1,
            "subtype - Popup": 1
        }
    },
    "filters": {
        "ids": [],
        "types": [],
        "regex": [],
        "strings": [],
        "raw data - hexadecimal": []
    }
}
~# python3.11 PDForensic.py objstm.pdf --data --types objstm --no-csv --no-json
0         pdf_tag                   b'%PDF-1.5\n'
1         object                    b'1 0 obj\n<< /Type /ObjStm /Length 236 /N 4 /First 20 >>\nstream\n2 0 3 34 4 78 5 143\n<< /Pages 3 0 R /Type /Catalog >>\n<< /Count 1 /Kids [ 4 0 R ] /Type /Pages >>\n<< /Contents 6 0 R /Parent 3 0 R /Resources 5 0 R /Type /Page >>\n<< /Font << /F1 << /BaseFont /Arial /Subtype /Type1 /Type /Font >> >> >>\nendstream\nendobj'
4         startxref                 
5         eof_tag                   b'%%EOF\n'
{
    "tool": "PDForensic",
    "version": "0.0.1",
    "file": "objstm.pdf",
    "date": "2022-12-27T19:42:13.226314",
    "malicious": {
        "score": "0%",
        "types": []
    },
    "objects": {
        "found": 6,
        "processed": 4,
        "counter": {
            "type - ObjStm ": 1,
            "type - XRef ": 1
        }
    },
    "filters": {
        "ids": [],
        "types": [
            "objstm"
        ],
        "regex": [],
        "strings": [],
        "raw data - hexadecimal": []
    }
}
~# python3.11 PDForensic.py https://www.pdfscripting.com/public/FreeStuff/PDFSamples/TheFlyv3_EN4Rdr.pdf --data --ids 79 83 --ids 84 --strings URI --no-csv --no-json
0         pdf_tag                   b'%PDF-1.6\r'
78        object                    b'87 0 obj\r<</URI(http://www.pdfscripting.com)/S/URI>>\rendobj'
79        object                    b'89 0 obj\r<</S/JavaScript/JS(\\nif\\(this.bouncing\\)\\r\\n{\\r\\n\tthis.bouncing = false;\\r\\n\tapp.clearInterval\\(this.bounceTime\\);\\r\\n\tthis.bounceTime = null;\\r\\n}\\r\\n\\r\\n//app.clearInterval\\(timer\\); // stop timer\\r\\n//app.clearTimeOut\\(timeout\\); // stop timer\\r\\n\\r\\n\\r\\n\\r)>>\rendobj'
83        object                    b'94 0 obj\r<</S/JavaScript/JS(\\nXinc = 5;\\r\\nYinc = 5;\\r\\n\\r\\n\\r)>>\rendobj'
84        object                    b'95 0 obj\r<</S/JavaScript/JS(\\nXinc = 3;\\r\\nYinc = 3;\\r\\n\\r\\n\\r)>>\rendobj'
97        object                    b'108 0 obj\r<</URI(http://www.windjack.com)/S/URI>>\rendobj'
98        xref                      
99        xref                      
100       startxref                 
101       eof_tag                   b'%%EOF\r'
{
    "tool": "PDForensic",
    "version": "0.0.1",
    "file": "<http.client.HTTPResponse object at 0x7fd5329a4760>",
    "date": "2022-12-27T19:44:38.964000",
    "malicious": {
        "score": "26%",
        "types": [
            "acroform",
            "scripts",
            "URI"
        ]
    },
    "objects": {
        "found": 102,
        "processed": 10,
        "counter": {
            "type - XObject": 31,
            "subtype - Form": 27,
            "type - Annot": 14,
            "subtype - Widget": 12,
            "type - Font": 8,
            "subtype - Type1": 4,
            "subtype - Image": 4,
            "subtype - TrueType": 4,
            "type - FontDescriptor": 3,
            "type - ObjStm": 2,
            "type - Encoding": 2,
            "type - ExtGState": 2,
            "type - XRef": 1,
            "type - Outlines": 1,
            "type - Pages": 1,
            "subtype - XML": 1,
            "type - Metadata": 1,
            "type - TransformParams": 1,
            "type - SigRef": 1,
            "type - Sig": 1,
            "type - Catalog": 1,
            "type - OCG": 1,
            "type - Page": 1,
            "subtype - Link": 1,
            "type - Border": 1,
            "subtype - Popup": 1
        }
    },
    "filters": {
        "ids": [
            83,
            84,
            79
        ],
        "types": [],
        "regex": [],
        "strings": [
            "URI"
        ],
        "raw data - hexadecimal": []
    }
}
~# python3.11 PDForensic.py objstm.pdf --data --logs 20 --regex '[0-9a-f]{32}' --no-csv --no-json
0         pdf_tag                   b'%PDF-1.5\n'
[2022-12-27 19:54:14] INFO     (20) {PDForensic - PDForensic.py:634} Object 3 match the 'regex' filter.
3         object                    b'7 0 obj\n<< /Type /XRef /Length 32 /W [ 1 2 1 ] /Root 2 0 R /Size 8 /ID [<98e68406a8333cc2a3429ac0e8aa1fed><05fa7af561f775eeb73f00cd09fe19e7>] >>\nstream\n\x00\x00\x00\x00\x01\x00\x0f\x00\x02\x00\x01\x00\x02\x00\x01\x01\x02\x00\x01\x02\x02\x00\x01\x03\x01\x01J\x00\x01\x01\xb4\x00\nendstream\nendobj'
4         startxref                 
5         eof_tag                   b'%%EOF\n'
{
    "tool": "PDForensic",
    "version": "0.0.1",
    "file": "objstm.pdf",
    "date": "2022-12-27T19:54:14.196113",
    "malicious": {
        "score": "0%",
        "types": []
    },
    "objects": {
        "found": 6,
        "processed": 4,
        "counter": {
            "type - ObjStm ": 1,
            "type - XRef ": 1
        }
    },
    "filters": {
        "ids": [],
        "types": [],
        "strings": [],
        "regex": [
            "[0-9a-f]{32}"
        ],
        "raw data - hexadecimal": []
    }
}
~# python3.11 PDForensic.py objstm.pdf --data --hexa 000102
0         pdf_tag                   b'%PDF-1.5\n'
3         object                    b'7 0 obj\n<< /Type /XRef /Length 32 /W [ 1 2 1 ] /Root 2 0 R /Size 8 /ID [<98e68406a8333cc2a3429ac0e8aa1fed><05fa7af561f775eeb73f00cd09fe19e7>] >>\nstream\n\x00\x00\x00\x00\x01\x00\x0f\x00\x02\x00\x01\x00\x02\x00\x01\x01\x02\x00\x01\x02\x02\x00\x01\x03\x01\x01J\x00\x01\x01\xb4\x00\nendstream\nendobj'
4         startxref                 
5         eof_tag                   b'%%EOF\n'
{
    "tool": "PDForensic",
    "version": "0.0.1",
    "file": "objstm.pdf",
    "date": "2022-12-27T20:05:19.538251",
    "malicious": {
        "score": "0%",
        "types": []
    },
    "objects": {
        "found": 6,
        "processed": 4,
        "counter": {
            "type - ObjStm ": 1,
            "type - XRef ": 1
        }
    },
    "filters": {
        "ids": [],
        "types": [],
        "strings": [],
        "regex": [],
        "raw data - hexadecimal": [
            "000102"
        ]
    }
}
~# 
 
>>> from PDForensic import PDForensic
>>> class MyPDFparser(PDForensic):
...     def __init__(self):
...         super().__init__("objstm.pdf")
...     def handle(self, type_: str, data: bytes, typename: str = "") -> None:
...         print(type_, data, typename)
>>> parser = MyPDFparser()
>>> parser.parse()
pdf_tag b'%PDF-1.5\n' 
stream_object b'< /Type /ObjStm /' 
type b'/Type /XRef ' XRef 
startxref b'startxref\n436\n' 
eof_tag b'%%EOF\n' 
>>> print(parser.report())
{'tool': 'PDForensic', 'version': '0.0.1', 'file': 'objstm.pdf', 'date': '2022-12-27T20:26:27.425086', 'malicious': {'score': '10%', 'types': ['stream_object']}, 'objects': {'found': 6, 'processed': 5, 'counter': {'type - XRef ': 1}}, 'filters': {'ids': [], 'types': [], 'strings': [], 'regex': [], 'raw data - hexadecimal': []}}
>>> class MyPDFparser(PDForensic):
...     def __init__(self):
...         super().__init__("objstm.pdf", process_data = True, process_tags = False, filter_ = True, strings = ["/Pages"], hexa = ["000102"], regexs = ['[0-9a-f]{32}'], types = ["xref"], ids = [2])
...     def handle(self, type_: str, data: bytes, typename: str = "") -> None:
...         print(type_, data, typename)
>>> parser = MyPDFparser()
>>> parser.parse()
pdf_tag b'%PDF-1.5\n' 
object b'1 0 obj\n<< /Type /ObjStm /Length 236 /N 4 /First 20 >>\nstream\n2 0 3 34 4 78 5 143\n<< /Pages 3 0 R /Type /Catalog >>\n<< /Count 1 /Kids [ 4 0 R ] /Type /Pages >>\n<< /Contents 6 0 R /Parent 3 0 R /Resources 5 0 R /Type /Page >>\n<< /Font << /F1 << /BaseFont /Arial /Subtype /Type1 /Type /Font >> >> >>\nendstream\nendobj' 
object b"6 0 obj\n<< /Length 57 >>\nstream\nq\nBT\n/F1 55 Tf\n10 400 Td\n(http://www.corkami.com) '\nET\nQ\nendstream\nendobj" 
object b'7 0 obj\n<< /Type /XRef /Length 32 /W [ 1 2 1 ] /Root 2 0 R /Size 8 /ID [<98e68406a8333cc2a3429ac0e8aa1fed><05fa7af561f775eeb73f00cd09fe19e7>] >>\nstream\n\x00\x00\x00\x00\x01\x00\x0f\x00\x02\x00\x01\x00\x02\x00\x01\x01\x02\x00\x01\x02\x02\x00\x01\x03\x01\x01J\x00\x01\x01\xb4\x00\nendstream\nendobj' 
startxref b'startxref\n436\n' 
eof_tag b'%%EOF\n' 
>>> print(parser.report())
{'tool': 'PDForensic', 'version': '0.0.1', 'file': 'objstm.pdf', 'date': '2022-12-27T20:38:38.078297', 'malicious': {'score': '10%', 'types': ['stream_object']}, 'objects': {'found': 6, 'processed': 6, 'counter': {'type - XRef ': 1}}, 'filters': {'ids': [2], 'types': ['xref'], 'strings': ['/Pages'], 'regex': ['[0-9a-f]{32}'], 'raw data - hexadecimal': ['000102']}}
>>>

 
Classes
       
abc.ABC(builtins.object)
PDForensic

 
class PDForensic(abc.ABC)
    PDForensic(file: str, process_data: bool = False, process_tags: bool = True, filter_: bool = True, strings: Iterable[str] = [], hexa: Iterable[str] = [], regexs: Iterable[str] = [], types: Iterable[str] = [], ids: Iterable[int] = [])
 
This class parses and analyses PDF files for Forensic Investigations.
 
 
Method resolution order:
PDForensic
abc.ABC
builtins.object

Methods defined here:
__init__(self, file: str, process_data: bool = False, process_tags: bool = True, filter_: bool = True, strings: Iterable[str] = [], hexa: Iterable[str] = [], regexs: Iterable[str] = [], types: Iterable[str] = [], ids: Iterable[int] = [])
Initialize self.  See help(type(self)) for accurate signature.
filter(self, data: bytes, decoded_data: bytes = None) -> bool
This function filters objects.
get_data_process(self, match: re.Match) -> bool
This function sends only the data to be processed to the filters.
get_malicious_score(self) -> float
This function calculates malicious score.
handle(self, type_: str, data: bytes, typename: str = '') -> None
parse(self) -> None
This function parses PDF data.
pdf_unfilter(self, tags: bytes, full_data: bytes) -> bytes
This function decodes and decompresses PDF streams.
read_file(self) -> bytes
This function returns the data read from the PDF file.
report(self) -> Dict[str, Union[str, int]]
This function reports PDF analysis.
to_handle(self, type_: str, data: bytes, typename: str = '') -> None
This function calls the inherited 'handle' method.
type_filter(self, type_: str, data: bytes, processed: bool = None) -> bool
This function filters objects by type.

Static methods defined here:
deobfuscation(tags: bytes) -> bytes
This function deobfuscates tags.
 
>>> PDForensic.deobfuscation(r'#61(\142)#63'.encode())
b'a(b)c'
>>> PDForensic.deobfuscation(r')'.encode())
[2016-06-22 17:58:15] ERROR    (40) {PDForensic - PDForensic.py:722} PDF syntax error
>>>

Data descriptors defined here:
__dict__
dictionary for instance variables (if defined)
__weakref__
list of weak references to the object (if defined)

Data and other attributes defined here:
__abstractmethods__ = frozenset({'handle'})
__annotations__ = {'malicious_scoring': typing.Dict[str, int]}
filters = {'A85': <function a85decode>, 'AHx': <function hex_decode>, 'ASCII85Decode': <function a85decode>, 'ASCIIHexDecode': <function hex_decode>, 'Fl': <function deflate>, 'FlateDecode': <function deflate>, 'LZW': <bound method LzwDecode.decode of <PDForensic.LzwDecode object>>, 'LZWDecode': <bound method LzwDecode.decode of <PDForensic.LzwDecode object>>, 'R': <function runlength_decode>, 'RunLengthDecode': <function runlength_decode>}
malicious_scoring = {'AA_script_starter': 75, 'GoTo': 15, 'OpenAction_script_starter': 75, 'URI': 25, 'acroform': 15, 'command': 100, 'embedded': 25, 'form': 25, 'malicious_image': 10, 'media': 10, ...}

 
Data
        __all__ = ['PDForensic']
__annotations__ = {'logger': <class 'logging.Logger'>, 'logger_critical': <class 'collections.abc.Callable'>, 'logger_debug': <class 'collections.abc.Callable'>, 'logger_error': <class 'collections.abc.Callable'>, 'logger_info': <class 'collections.abc.Callable'>, 'logger_log': <class 'collections.abc.Callable'>, 'logger_warning': <class 'collections.abc.Callable'>, 'pdf_filters': <class 're.Pattern'>, 'pdf_parser': <class 're.Pattern'>, 'pdf_streams': <class 're.Pattern'>, ...}
__author_email__ = 'mauricelambert434@gmail.com'
__copyright__ = '\nPDForensic Copyright (C) 2022, 2023 Maurice L...ome to redistribute it\nunder certain conditions.\n'
__description__ = 'This tool analyses PDF files for Forensic Investigations'
__license__ = 'GPL-3.0 License'
__maintainer__ = 'Maurice Lambert'
__maintainer_email__ = 'mauricelambert434@gmail.com'
__url__ = 'https://github.com/mauricelambert/PDForensic'

 
Author
        Maurice Lambert