PySpark Plaso
Release 2019
A tool for distributed extraction of timestamps from various files using extractors adapted from the Plaso engine to Apache Spark.
|
Public Member Functions | |
def | open_filesystem (self, hdfs_uri, user="hadoop") |
def | close_filesystem (self, filesystem=None) |
def | get_filesystem (self, force_filesystem=None) |
def | exists (self, path_string, filesystem=None) |
def | info (self, path_string, filesystem=None) |
def | remove (self, path_string, recursive=True, filesystem=None) |
def | mkdir (self, path_string, filesystem=None) |
def | list_files (self, path_string, recursion_level=0, include_dir_names=False, filesystem=None) |
def | open_inputstream (self, path, filesystem=None) |
def | open_outputstream (self, path, filesystem=None) |
def | close_stream (self, input_stream) |
def | get_stream_offset (self, input_stream) |
def | get_path_size (self, path, filesystem=None) |
def | read_inputstream (self, input_stream, size=None) |
def | write_outputstream (self, output_stream, data) |
def | pass_to_outputstream (self, output_stream, input_stream) |
def | seek_stream (self, stream, offset, whence=os.SEEK_SET) |
Inherited Public Member Functions | |
def | make_uri (self, filesystem=None) |
def | make_path (self, path_string, qualified=True, filesystem=None) |
def | make_simple_path (self, path_string, filesystem=None) |
def | make_qualified_path (self, path_string, filesystem=None) |
def | append_to_path (self, original_path, new_child_string) |
def | basename (self, path_string) |
def | dirname (self, path_string) |
Static Public Attributes | |
fs = None | |
Inherited Static Public Attributes | |
string | PATH_SEPARATOR = '/' |
string | SCHEME = 'hdfs' |
Additional Inherited Members | |
Inherited Static Public Member Functions | |
def | parse_uri (hdfs_uri) |
HDFS driver utilizing PyArrow library.
def plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.close_filesystem(self, filesystem=None)
Close a given HDFS filesystem. :param filesystem: the filesystem
Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
def plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.close_stream(self, input_stream)
Close a given (previously opened) input-stream for a HDFS file. :param input_stream: the opened input-stream
Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
def plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.exists(self, path_string, filesystem=None)
Check if a given HDFS path exists in the given HDFS filesystem. :param path_string: the path :param filesystem: the filesystem :return: True if the path exists in the filesystem, False otherwise
Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
def plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.get_filesystem(self, force_filesystem=None)
Get a given or a default HDFS filesystem. :param force_filesystem: the given filesystem :return: the filesystem
Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
def plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.get_path_size(self, path, filesystem=None)
Get the size of a given HDFS path (a file) in a given filesystem. :param path: the path :param filesystem: the filesystem :return: the size
Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
def plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.get_stream_offset(self, input_stream)
Get the current position (an offset) in a given (previously opened) input-stream for a HDFS file. :param input_stream: the opened input-stream :return: the position/offset
Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
def plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.info(self, path_string, filesystem=None)
Get metadata of a given path in a given or a default filesystem. :param path_string: the path :param filesystem: the filesystem :return: the metadata dictionary
Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
def plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.list_files(self, path_string, recursion_level=0, include_dir_names=False, filesystem=None)
Get a list of files (optionally recursively to the specified level) in a given HDFS path of a given filesystem. :param path_string: the path :param recursion_level: the level for the recursion (0 lists just the given directory without any recursion) :param include_dir_names: True to also include the names of directories (suffixed by /) :param filesystem: the filesystem :return: the list of files in the path
Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
def plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.mkdir(self, path_string, filesystem=None)
Make a new directory (if it does not exist) at a given path in a given or a default filesystem. :param path_string: the path :param filesystem: the filesystem
Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
def plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.open_filesystem(self, hdfs_uri, user="hadoop")
Open HDFS filesystem of a given URI and as a given HDFS user. :param hdfs_uri: HDFS URI to open :param user: HDFS user to act as when opening :return: the opened filesystem as an object of pyarrow.HadoopFileSystem
Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
def plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.open_inputstream(self, path, filesystem=None)
Open and get an input-stream for a HDFS file given by its path in a given filesystem. :param path: the path :param filesystem: the filesystem :return: the opened input-stream
Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
def plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.open_outputstream(self, path, filesystem=None)
Open and get an output-stream for a HDFS file given by its path in a given filesystem. :param path: the path :param filesystem: the filesystem :return: the opened output-stream
Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
def plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.pass_to_outputstream(self, output_stream, input_stream)
Pass data from the input stream to the output stream of a HDFS file. :param output_stream: the output-stream of the HDFS file :param input_stream: the input-stream
def plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.read_inputstream(self, input_stream, size=None)
Read data from a given opened input stream for a HDFS file. :param input_stream: the input-stream :param size: the size of data to read :return: the data
Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
def plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.remove(self, path_string, recursive=True, filesystem=None)
Remove (optionally recursively) a HDFS file/directory given by its path in a given or a default filesystem. :param path_string: the path :param recursive: True to remove recursively :param filesystem: the filesystem
Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
def plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.seek_stream(self, stream, offset, whence=os.SEEK_SET)
Set a given position (an offset) in a given (previously opened) input-stream for a HDFS file. :param stream: the opened stream :param offset: the position/offset :param whence: the reference point the offset is relative to (defaults to os.SEEK_SET, i.e. the start of the stream)
Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
def plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.write_outputstream(self, output_stream, data)
Write data buffer to a given opened output stream for a HDFS file. :param output_stream: the output-stream :param data: the data buffer to write :return: the number of bytes written
Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
|
static |