PySpark Plaso, Release 2019
A tool for distributed extraction of timestamps from various files using extractors adapted from the Plaso engine to Apache Spark.
plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs Class Reference
Inherits plaso.tarzan.lib.hdfs.Hdfs.

Public Member Functions

def __init__ (self, spark_context)
 
def open_filesystem (self, hdfs_uri, user="hadoop")
 
def close_filesystem (self, filesystem=None)
 
def make_path (self, path_string, qualified=True, filesystem=None)
 
def append_to_path (self, original_path, new_child_string)
 
def list_files (self, path_string, recursion_level=0, include_dir_names=False, filesystem=None)
 
def open_inputstream (self, path, filesystem=None)
 
def close_stream (self, input_stream)
 
def get_stream_offset (self, input_stream)
 
def get_path_size (self, path, filesystem=None)
 
def read_inputstream (self, input_stream, size=sys.maxsize)
 
def read_inputstream_with_chunk (self, input_stream, size=sys.maxsize, chunk_size=2048)
 
def seek_stream (self, stream, offset, whence=os.SEEK_SET)
 
def seek_stream_with_path (self, stream, offset, whence=os.SEEK_SET, path=None)
 
def get_filesystem (self, force_filesystem)
 
def exists (self, path_string, filesystem)
 
def info (self, path_string, filesystem=None)
 
def remove (self, path_string, recursive=True, filesystem=None)
 
def mkdir (self, path_string, filesystem=None)
 
def open_outputstream (self, path, filesystem=None)
 
def write_outputstream (self, output_stream, data)
 
- Public Member Functions inherited from plaso.tarzan.lib.hdfs.Hdfs
def make_uri (self, filesystem=None)
 
def make_simple_path (self, path_string, filesystem=None)
 
def make_qualified_path (self, path_string, filesystem=None)
 
def basename (self, path_string)
 
def dirname (self, path_string)
 

Public Attributes

 spark_context
 
 uri_class
 
 path_class
 
 fs_class
 
 isr_class
 
 fs
 

Additional Inherited Members

- Static Public Member Functions inherited from plaso.tarzan.lib.hdfs.Hdfs
def parse_uri (hdfs_uri)
 
- Static Public Attributes inherited from plaso.tarzan.lib.hdfs.Hdfs
string PATH_SEPARATOR = '/'
 
string SCHEME = 'hdfs'
 

Detailed Description

HDFS driver utilizing the JVM gateway of the Spark Context.
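
A minimal usage sketch, assuming a Spark application with access to an HDFS cluster; the NameNode URI hdfs://namenode:8020 and the /data directory are placeholders:

    from pyspark import SparkConf, SparkContext
    from plaso.tarzan.lib.pyspark_hdfs import PySparkHdfs

    # Build the HDFS driver on top of the JVM gateway of a Spark Context.
    sc = SparkContext(conf=SparkConf().setAppName("pyspark-hdfs-example"))
    hdfs = PySparkHdfs(sc)

    # Open the filesystem of the (placeholder) NameNode URI as the "hadoop" user.
    fs = hdfs.open_filesystem("hdfs://namenode:8020", user="hadoop")

    # List entries directly under a (placeholder) directory, directory names included.
    for name in hdfs.list_files("/data", include_dir_names=True, filesystem=fs):
        print(name)

    hdfs.close_filesystem(fs)
    sc.stop()

The sketches for the individual member functions below reuse the names sc, hdfs, and fs constructed this way.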

Constructor & Destructor Documentation

◆ __init__()

def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.__init__ (   self,
  spark_context 
)
Initialize the driver.
:param spark_context: the Spark Context
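
A minimal construction sketch; the application name is a placeholder and sc may be any existing Spark Context:

    from pyspark import SparkContext
    from plaso.tarzan.lib.pyspark_hdfs import PySparkHdfs

    sc = SparkContext(appName="pyspark-hdfs-init")
    hdfs = PySparkHdfs(sc)  # the driver works through the JVM gateway of this context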

Member Function Documentation

◆ append_to_path()

def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.append_to_path (   self,
  original_path,
  new_child_string 
)
Append a path/directory/file to another HDFS path.
:param original_path: the path to append to
:param new_child_string: the path to append
:return: the resulting path

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
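
For example (hdfs as in the sketch under Detailed Description; the path components are placeholders, and the exact path type returned by make_path() is an implementation detail of the driver):

    base = hdfs.make_path("/data", qualified=False)
    child = hdfs.append_to_path(base, "images/photo.jpg")
    # child should now refer to /data/images/photo.jpg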

◆ close_filesystem()

def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.close_filesystem (   self,
  filesystem = None 
)
Close a given HDFS filesystem.
:param filesystem: the filesystem

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.

◆ close_stream()

def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.close_stream (   self,
  input_stream 
)
Close a given (previously opened) input-stream for an HDFS file.
:param input_stream: the opened input-stream

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.

◆ exists()

def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.exists (   self,
  path_string,
  filesystem 
)
Check if a given HDFS path exists in the given HDFS filesystem.
:param path_string: the path
:param filesystem: the filesystem
:return: True if the path exists in the filesystem, False otherwise

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
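
For example (hdfs and fs as in the sketch under Detailed Description; the path is a placeholder):

    if hdfs.exists("/data/images/photo.jpg", fs):
        print("the path exists")
    else:
        print("no such path")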

◆ get_filesystem()

def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.get_filesystem (   self,
  force_filesystem 
)
Get the given HDFS filesystem or, if none is given, the default one.
:param force_filesystem: the given filesystem
:return: the filesystem

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.

◆ get_path_size()

def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.get_path_size (   self,
  path,
  filesystem = None 
)
Get the size of a given HDFS path (a file) in a given filesystem.
:param path: the path
:param filesystem: the filesystem
:return: the size

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.

◆ get_stream_offset()

def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.get_stream_offset (   self,
  input_stream 
)
Get the current position (an offset) in a given (previously opened) input-stream for an HDFS file.
:param input_stream: the opened input-stream
:return: the position/offset

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.

◆ info()

def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.info (   self,
  path_string,
  filesystem = None 
)
Get metadata of a given path in a given or a default filesystem.
:param path_string: the path
:param filesystem: the filesystem
:return: the metadata dictionary

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
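
For example (the keys of the returned metadata dictionary are not documented here, so the sketch only prints them):

    metadata = hdfs.info("/data/images/photo.jpg", filesystem=fs)
    for key, value in metadata.items():
        print(key, value)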

◆ list_files()

def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.list_files (   self,
  path_string,
  recursion_level = 0,
  include_dir_names = False,
  filesystem = None 
)
Get a list of files (optionally recursively to the specified level) in a given HDFS path of a given filesystem.
:param path_string: the path
:param recursion_level: the level for the recursion (0 is just a current directory without any recursion)
:param include_dir_names: True to also include directory names (suffixed by /)
:param filesystem: the filesystem
:return: the list of files in the path

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
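
For example, to descend two directory levels below a placeholder path and keep only the regular files (hdfs and fs as in the sketch under Detailed Description):

    entries = hdfs.list_files(
        "/data",
        recursion_level=2,       # 0 means no recursion; 2 should descend two levels
        include_dir_names=True,  # directory names are suffixed by '/'
        filesystem=fs,
    )
    files = [entry for entry in entries if not entry.endswith("/")]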

◆ make_path()

def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.make_path (   self,
  path_string,
  qualified = True,
  filesystem = None 
)
Get a (qualified) HDFS URI from a given path and a given or a default filesystem.
:param path_string: the path
:param qualified: True to get the qualified path
:param filesystem: the filesystem
:return: the resulting path

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
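
For example, a qualified and a plain path can be built for the same path string (the NameNode URI in the comment is the placeholder from the sketch under Detailed Description):

    qualified = hdfs.make_path("/data/images", qualified=True, filesystem=fs)
    plain = hdfs.make_path("/data/images", qualified=False)
    # `qualified` should carry the scheme and authority of fs,
    # e.g. hdfs://namenode:8020/data/images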

◆ mkdir()

def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.mkdir (   self,
  path_string,
  filesystem = None 
)
Make a new directory (if it does not exist) at a given path in a given or a default filesystem.
:param path_string: the path
:param filesystem: the filesystem

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
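
For example, creating a placeholder output directory (the exists() check is redundant if mkdir() itself skips existing directories, as stated above):

    if not hdfs.exists("/data/output", fs):
        hdfs.mkdir("/data/output", filesystem=fs)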

◆ open_filesystem()

def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.open_filesystem (   self,
  hdfs_uri,
  user = "hadoop" 
)
Open the HDFS filesystem of a given URI as a given HDFS user.
:param hdfs_uri: HDFS URI to open
:param user: HDFS user to act as when opening
:return: the opened filesystem as an object of org.apache.hadoop.fs.FileSystem class

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
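
For example (the NameNode URI and the user name are placeholders):

    fs = hdfs.open_filesystem("hdfs://namenode:8020", user="analyst")
    try:
        print(hdfs.exists("/data", fs))
    finally:
        hdfs.close_filesystem(fs)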

◆ open_inputstream()

def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.open_inputstream (   self,
  path,
  filesystem = None 
)
Open and get an input-stream for an HDFS file given by its path in a given filesystem.
:param path: the path
:param filesystem: the filesystem
:return: the opened input-stream

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.

◆ open_outputstream()

def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.open_outputstream (   self,
  path,
  filesystem = None 
)
Open and get an output-stream for an HDFS file given by its path in a given filesystem.
:param path: the path
:param filesystem: the filesystem
:return: the opened output-stream

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.

◆ read_inputstream()

def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.read_inputstream (   self,
  input_stream,
  size = sys.maxsize 
)
Read data from a given opened input-stream for an HDFS file.
:param input_stream: the input-stream
:param size: the size of data to read
:return: the data

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
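
A sketch of reading a whole placeholder file in one call; whether open_inputstream() also accepts a plain path string is not documented here, so the path is built with make_path() first:

    path = hdfs.make_path("/data/images/photo.jpg", filesystem=fs)
    stream = hdfs.open_inputstream(path, filesystem=fs)
    try:
        # size defaults to sys.maxsize; here the exact file size is requested instead
        data = hdfs.read_inputstream(stream, size=hdfs.get_path_size(path, filesystem=fs))
        print(len(data), "bytes read")
    finally:
        hdfs.close_stream(stream)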

◆ read_inputstream_with_chunk()

def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.read_inputstream_with_chunk (   self,
  input_stream,
  size = sys.maxsize,
  chunk_size = 2048 
)
Read data from a given opened input-stream for an HDFS file, transferring it in chunks of the given chunk size.
:param input_stream: the input-stream
:param size: the size of data to read
:param chunk_size: the size of a chunk
:return: the data
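
For example, reading at most 1 MiB from the stream opened in the read_inputstream() sketch above, transferred in 64 KiB chunks:

    data = hdfs.read_inputstream_with_chunk(
        stream,
        size=1024 * 1024,      # read at most 1 MiB in total
        chunk_size=64 * 1024,  # transfer it in 64 KiB chunks
    )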

◆ remove()

def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.remove (   self,
  path_string,
  recursive = True,
  filesystem = None 
)
Remove (optionally recursively) an HDFS file/directory given by its path in a given or a default filesystem.
:param path_string: the path
:param recursive: True to remove recursively
:param filesystem: the filesystem

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.

◆ seek_stream()

def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.seek_stream (   self,
  stream,
  offset,
  whence = os.SEEK_SET 
)
Set a given position (an offset) in a given (previously opened) input-stream for an HDFS file.
:param stream: the opened stream
:param offset: the position/offset
:param whence: the reference point for the offset (an os.SEEK_* constant, os.SEEK_SET by default)

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
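
For example, skipping a hypothetical 16-byte header of the stream opened in the read_inputstream() sketch above and then rewinding:

    import os

    hdfs.seek_stream(stream, 16)                     # jump past a 16-byte header
    print(hdfs.get_stream_offset(stream))            # should print 16
    hdfs.seek_stream(stream, 0, whence=os.SEEK_SET)  # rewind to the beginning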

◆ seek_stream_with_path()

def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.seek_stream_with_path (   self,
  stream,
  offset,
  whence = os.SEEK_SET,
  path = None 
)
Set a given position (an offset) in a given (previously opened) input-stream for an HDFS file.
:param stream: the opened stream
:param offset: the position/offset
:param whence: the reference point for the offset (an os.SEEK_* constant, os.SEEK_SET by default)
:param path: the path of the file required to be able to seek from the end of the file

◆ write_outputstream()

def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.write_outputstream (   self,
  output_stream,
  data 
)
Write a data buffer to a given opened output-stream for an HDFS file.
:param output_stream: the output-stream
:param data: the data buffer to write
:return: the number of bytes written

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
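
A sketch of writing a small placeholder file; note that no close method for output streams is listed above, so the close() call on the returned stream object is an assumption:

    path = hdfs.make_path("/data/output/report.txt", filesystem=fs)
    out = hdfs.open_outputstream(path, filesystem=fs)
    try:
        written = hdfs.write_outputstream(out, b"timestamps extracted\n")
        print(written, "bytes written")
    finally:
        out.close()  # assumption: the returned JVM stream object exposes close()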

Member Data Documentation

◆ fs

plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.fs

◆ fs_class

plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.fs_class

◆ isr_class

plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.isr_class

◆ path_class

plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.path_class

◆ spark_context

plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.spark_context

◆ uri_class

plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.uri_class
