PySpark Plaso
Release 2019
A tool for distributed extraction of timestamps from various files using extractors adapted from the Plaso engine to Apache Spark.
|
Public Member Functions | |
def | make_uri (self, filesystem=None) |
def | open_filesystem (self, hdfs_uri, user) |
def | close_filesystem (self, filesystem) |
def | get_filesystem (self, force_filesystem) |
def | exists (self, path_string, filesystem) |
def | make_path (self, path_string, qualified=True, filesystem=None) |
def | make_simple_path (self, path_string, filesystem=None) |
def | make_qualified_path (self, path_string, filesystem=None) |
def | append_to_path (self, original_path, new_child_string) |
def | basename (self, path_string) |
def | dirname (self, path_string) |
def | info (self, path_string, filesystem=None) |
def | remove (self, path_string, recursive=True, filesystem=None) |
def | mkdir (self, path_string, filesystem=None) |
def | list_files (self, path_string, recursion_level=0, include_dir_names=False, filesystem=None) |
def | open_inputstream (self, path, filesystem) |
def | open_outputstream (self, path, filesystem=None) |
def | close_stream (self, input_stream) |
def | get_stream_offset (self, input_stream) |
def | get_path_size (self, path, filesystem) |
def | read_inputstream (self, input_stream, size) |
def | write_outputstream (self, output_stream, data) |
def | seek_stream (self, stream, offset, whence=os.SEEK_SET) |
Static Public Member Functions | |
def | parse_uri (hdfs_uri) |
Static Public Attributes | |
string | PATH_SEPARATOR = '/' |
string | SCHEME = 'hdfs' |
An abstract class for drivers to access HDFS.
def plaso.tarzan.lib.hdfs.Hdfs.append_to_path | ( | self, | |
original_path, | |||
new_child_string | |||
) |
Append a path/directory/file to another HDFS path. :param original_path: the path to append to :param new_child_string: the path to append :return: the resulting path
Reimplemented in plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.
def plaso.tarzan.lib.hdfs.Hdfs.basename | ( | self, | |
path_string | |||
) |
Get a basename from a given HDFS path. :param path_string: the path :return: the basename
def plaso.tarzan.lib.hdfs.Hdfs.close_filesystem | ( | self, | |
filesystem | |||
) |
Close a given HDFS filesystem. :param filesystem: the filesystem
Reimplemented in plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs, and plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.
def plaso.tarzan.lib.hdfs.Hdfs.close_stream | ( | self, | |
input_stream | |||
) |
Close a given (previously opened) input-stream for an HDFS file. :param input_stream: the opened input-stream
Reimplemented in plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs, and plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.
def plaso.tarzan.lib.hdfs.Hdfs.dirname | ( | self, | |
path_string | |||
) |
Get a dirname from a given HDFS path. :param path_string: the path :return: the dirname
def plaso.tarzan.lib.hdfs.Hdfs.exists | ( | self, | |
path_string, | |||
filesystem | |||
) |
Check if a given HDFS path exists in the given HDFS filesystem. :param path_string: the path :param filesystem: the filesystem :return: True if the path exists in the filesystem, False otherwise
Reimplemented in plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs, and plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.
def plaso.tarzan.lib.hdfs.Hdfs.get_filesystem | ( | self, | |
force_filesystem | |||
) |
Get a given or a default HDFS filesystem. :param force_filesystem: the given filesystem :return: the filesystem
Reimplemented in plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs, and plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.
def plaso.tarzan.lib.hdfs.Hdfs.get_path_size | ( | self, | |
path, | |||
filesystem | |||
) |
Get the size of a given HDFS path (a file) in a given filesystem. :param path: the path :param filesystem: the filesystem :return: the size
Reimplemented in plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs, and plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.
def plaso.tarzan.lib.hdfs.Hdfs.get_stream_offset | ( | self, | |
input_stream | |||
) |
Get the current position (an offset) in a given (previously opened) input-stream for an HDFS file. :param input_stream: the opened input-stream :return: the position/offset
Reimplemented in plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs, and plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.
def plaso.tarzan.lib.hdfs.Hdfs.info | ( | self, | |
path_string, | |||
filesystem = None |
|||
) |
Get metadata of a given path in a given or a default filesystem. :param path_string: the path :param filesystem: the filesystem :return: the metadata dictionary
Reimplemented in plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs, and plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.
def plaso.tarzan.lib.hdfs.Hdfs.list_files | ( | self, | |
path_string, | |||
recursion_level = 0 , |
|||
include_dir_names = False , |
|||
filesystem = None |
|||
) |
Get a list of files (optionally recursively to the specified level) in a given HDFS path of a given filesystem. :param path_string: the path :param recursion_level: the level for the recursion (0 means just the current directory without any recursion) :param include_dir_names: True to also include names of directories (suffixed by /) :param filesystem: the filesystem :return: the list of files in the path
Reimplemented in plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs, and plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.
def plaso.tarzan.lib.hdfs.Hdfs.make_path | ( | self, | |
path_string, | |||
qualified = True , |
|||
filesystem = None |
|||
) |
Get a (qualified) HDFS URI from a given path and a given or a default filesystem. :param path_string: the path :param qualified: True to get the qualified path :param filesystem: the filesystem :return: the resulting path
Reimplemented in plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.
def plaso.tarzan.lib.hdfs.Hdfs.make_qualified_path | ( | self, | |
path_string, | |||
filesystem = None |
|||
) |
Get a qualified HDFS URI from a given path and a given or a default filesystem. :param path_string: the path :param filesystem: the filesystem :return: the resulting path
def plaso.tarzan.lib.hdfs.Hdfs.make_simple_path | ( | self, | |
path_string, | |||
filesystem = None |
|||
) |
Get an unqualified HDFS URI from a given path and a given or a default filesystem. :param path_string: the path :param filesystem: the filesystem :return: the resulting path
def plaso.tarzan.lib.hdfs.Hdfs.make_uri | ( | self, | |
filesystem = None |
|||
) |
Get HDFS URI of a given or a default filesystem. :param filesystem: the filesystem :return: the HDFS URI
def plaso.tarzan.lib.hdfs.Hdfs.mkdir | ( | self, | |
path_string, | |||
filesystem = None |
|||
) |
Make a new directory (if it does not exist) at a given path in a given or a default filesystem. :param path_string: the path :param filesystem: the filesystem
Reimplemented in plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs, and plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.
def plaso.tarzan.lib.hdfs.Hdfs.open_filesystem | ( | self, | |
hdfs_uri, | |||
user | |||
) |
Open HDFS filesystem of a given URI and as a given HDFS user. :param hdfs_uri: HDFS URI to open :param user: HDFS user to act as when opening :return: the opened filesystem
Reimplemented in plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs, and plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.
def plaso.tarzan.lib.hdfs.Hdfs.open_inputstream | ( | self, | |
path, | |||
filesystem | |||
) |
Open and get an input-stream for an HDFS file given by its path in a given filesystem. :param path: the path :param filesystem: the filesystem :return: the opened input-stream
Reimplemented in plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs, and plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.
def plaso.tarzan.lib.hdfs.Hdfs.open_outputstream | ( | self, | |
path, | |||
filesystem = None |
|||
) |
Open and get an output-stream for an HDFS file given by its path in a given filesystem. :param path: the path :param filesystem: the filesystem :return: the opened output-stream
Reimplemented in plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs, and plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.
|
static |
Parse an HDFS URI into its individual components. :param hdfs_uri: HDFS URI of the file to parse :return: an object that provides the individual components of the URI
def plaso.tarzan.lib.hdfs.Hdfs.read_inputstream | ( | self, | |
input_stream, | |||
size | |||
) |
Read data from a given opened input stream for an HDFS file. :param input_stream: the input-stream :param size: the size of data to read :return: the data
Reimplemented in plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs, and plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.
def plaso.tarzan.lib.hdfs.Hdfs.remove | ( | self, | |
path_string, | |||
recursive = True , |
|||
filesystem = None |
|||
) |
Remove (optionally recursively) an HDFS file/directory given by its path in a given or a default filesystem. :param path_string: the path :param recursive: True to remove recursively :param filesystem: the filesystem
Reimplemented in plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs, and plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.
def plaso.tarzan.lib.hdfs.Hdfs.seek_stream | ( | self, | |
stream, | |||
offset, | |||
whence = os.SEEK_SET |
|||
) |
Set a given position (an offset) in a given (previously opened) input-stream for an HDFS file. :param stream: the opened stream :param offset: the position/offset :param whence: the reference point the offset is relative to (os.SEEK_SET by default)
Reimplemented in plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs, and plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.
def plaso.tarzan.lib.hdfs.Hdfs.write_outputstream | ( | self, | |
output_stream, | |||
data | |||
) |
Write a data buffer to a given opened output stream for an HDFS file. :param output_stream: the output-stream :param data: the data buffer to write :return: the number of bytes written
Reimplemented in plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs, and plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.
|
static |
|
static |