PySpark Plaso
Release 2019
A tool for distributed extraction of timestamps from various files using extractors adapted from the Plaso engine to Apache Spark.
|
Public Member Functions | |
def | __init__ (self, spark_context) |
def | open_filesystem (self, hdfs_uri, user="hadoop") |
def | close_filesystem (self, filesystem=None) |
def | make_path (self, path_string, qualified=True, filesystem=None) |
def | append_to_path (self, original_path, new_child_string) |
def | list_files (self, path_string, recursion_level=0, include_dir_names=False, filesystem=None) |
def | open_inputstream (self, path, filesystem=None) |
def | close_stream (self, input_stream) |
def | get_stream_offset (self, input_stream) |
def | get_path_size (self, path, filesystem=None) |
def | read_inputstream (self, input_stream, size=sys.maxsize) |
def | read_inputstream_with_chunk (self, input_stream, size=sys.maxsize, chunk_size=2048) |
def | seek_stream (self, stream, offset, whence=os.SEEK_SET) |
def | seek_stream_with_path (self, stream, offset, whence=os.SEEK_SET, path=None) |
def | get_filesystem (self, force_filesystem) |
def | exists (self, path_string, filesystem) |
def | info (self, path_string, filesystem=None) |
def | remove (self, path_string, recursive=True, filesystem=None) |
def | mkdir (self, path_string, filesystem=None) |
def | open_outputstream (self, path, filesystem=None) |
def | write_outputstream (self, output_stream, data) |
![]() | |
def | make_uri (self, filesystem=None) |
def | make_simple_path (self, path_string, filesystem=None) |
def | make_qualified_path (self, path_string, filesystem=None) |
def | basename (self, path_string) |
def | dirname (self, path_string) |
Public Attributes | |
spark_context | |
uri_class | |
path_class | |
fs_class | |
isr_class | |
fs | |
Additional Inherited Members | |
![]() | |
def | parse_uri (hdfs_uri) |
![]() | |
string | PATH_SEPARATOR = '/' |
string | SCHEME = 'hdfs' |
HDFS driver utilizing JVM gateway of the Spark Context.
def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.__init__ | ( | self, | |
spark_context | |||
) |
Initialize the driver. :param spark_context: the Spark Context
def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.append_to_path | ( | self, | |
original_path, | |||
new_child_string | |||
) |
Append a path/directory/file to another HDFS path. :param original_path: the path to append to :param new_child_string: the path to append :return: the resulting path
Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.close_filesystem | ( | self, | |
filesystem = None |
|||
) |
Close a given HDFS filesystem. :param filesystem: the filesystem
Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.close_stream | ( | self, | |
input_stream | |||
) |
Close a given (previously opened) input-stream for a HDFS file. :param input_stream: the opened input-stream
Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.exists | ( | self, | |
path_string, | |||
filesystem | |||
) |
Check if a given HDFS path exists in the given HDFS filesystem. :param path_string: the path :param filesystem: the filesystem :return: True if the path exists in the filesystem, False otherwise
Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.get_filesystem | ( | self, | |
force_filesystem | |||
) |
Get a given or a default HDFS filesystem. :param force_filesystem: the given filesystem :return: the filesystem
Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.get_path_size | ( | self, | |
path, | |||
filesystem = None |
|||
) |
Get the size of a given HDFS path (a file) in a given filesystem. :param path: the path :param filesystem: the filesystem :return: the size
Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.get_stream_offset | ( | self, | |
input_stream | |||
) |
Get the current position (an offset) in a given (previously opened) input-stream for a HDFS file. :param input_stream: the opened input-stream :return: the position/offset
Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.info | ( | self, | |
path_string, | |||
filesystem = None |
|||
) |
Get metadata of a given path in a given or a default filesystem. :param path_string: the path :param filesystem: the filesystem :return: the metadata dictionary
Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.list_files | ( | self, | |
path_string, | |||
recursion_level = 0 , |
|||
include_dir_names = False , |
|||
filesystem = None |
|||
) |
Get a list of files (optionally recursively to the specified level) in a given HDFS path of a given filesystem. :param path_string: the path :param recursion_level: the level for the recursion (0 is just the current directory without any recursion) :param include_dir_names: True to also include names of directories (suffixed by /) :param filesystem: the filesystem :return: the list of files in the path
Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.make_path | ( | self, | |
path_string, | |||
qualified = True , |
|||
filesystem = None |
|||
) |
Get a (qualified) HDFS URI from a given path and a given or a default filesystem. :param path_string: the path :param qualified: True to get the qualified path :param filesystem: the filesystem :return: the resulting path
Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.mkdir | ( | self, | |
path_string, | |||
filesystem = None |
|||
) |
Make a new directory (if it does not exist) at a given path in a given or a default filesystem. :param path_string: the path :param filesystem: the filesystem
Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.open_filesystem | ( | self, | |
hdfs_uri, | |||
user = "hadoop" |
|||
) |
Open the HDFS filesystem at a given URI as a given HDFS user. :param hdfs_uri: HDFS URI to open :param user: HDFS user to act as when opening :return: the opened filesystem as an object of org.apache.hadoop.fs.FileSystem class
Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.open_inputstream | ( | self, | |
path, | |||
filesystem = None |
|||
) |
Open and get an input-stream for a HDFS file given by its path in a given filesystem. :param path: the path :param filesystem: the filesystem :return: the opened input-stream
Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.open_outputstream | ( | self, | |
path, | |||
filesystem = None |
|||
) |
Open and get an output-stream for a HDFS file given by its path in a given filesystem. :param path: the path :param filesystem: the filesystem :return: the opened output-stream
Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.read_inputstream | ( | self, | |
input_stream, | |||
size = sys.maxsize |
|||
) |
Read data from a given opened input stream for an HDFS file. :param input_stream: the input-stream :param size: the size of data to read :return: the data
Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.read_inputstream_with_chunk | ( | self, | |
input_stream, | |||
size = sys.maxsize , |
|||
chunk_size = 2048 |
|||
) |
Read data from a given opened input stream for an HDFS file. :param input_stream: the input-stream :param size: the size of data to read :param chunk_size: the size of a chunk :return: the data
def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.remove | ( | self, | |
path_string, | |||
recursive = True , |
|||
filesystem = None |
|||
) |
Remove (optionally recursively) a HDFS file/directory given by its path in a given or a default filesystem. :param path_string: the path :param recursive: True to remove recursively :param filesystem: the filesystem
Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.seek_stream | ( | self, | |
stream, | |||
offset, | |||
whence = os.SEEK_SET |
|||
) |
Set a given position (an offset) in a given (previously opened) input-stream for an HDFS file. :param stream: the opened stream :param offset: the position/offset :param whence: the reference point the offset is relative to (os.SEEK_SET by default)
Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.seek_stream_with_path | ( | self, | |
stream, | |||
offset, | |||
whence = os.SEEK_SET , |
|||
path = None |
|||
) |
Set a given position (an offset) in a given (previously opened) input-stream for an HDFS file. :param stream: the opened stream :param offset: the position/offset :param whence: the reference point the offset is relative to (os.SEEK_SET by default) :param path: the path of the file required to be able to seek from the end of the file
def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.write_outputstream | ( | self, | |
output_stream, | |||
data | |||
) |
Write data buffer to a given opened output stream for a HDFS file. :param output_stream: the output-stream :param data: the data buffer to write :return: the number of bytes written
Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.fs |
plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.fs_class |
plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.isr_class |
plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.path_class |
plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.spark_context |
plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.uri_class |