PySpark Plaso  Release 2019
A tool for distributed extraction of timestamps from various files using extractors adapted from the Plaso engine to Apache Spark.
Public Member Functions | Static Public Attributes | List of all members
plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs Class Reference
Inheritance diagram for plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs:
Inheritance graph
[legend]
Collaboration diagram for plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs:
Collaboration graph
[legend]

Public Member Functions

def open_filesystem (self, hdfs_uri, user="hadoop")
 
def close_filesystem (self, filesystem=None)
 
def get_filesystem (self, force_filesystem=None)
 
def exists (self, path_string, filesystem=None)
 
def info (self, path_string, filesystem=None)
 
def remove (self, path_string, recursive=True, filesystem=None)
 
def mkdir (self, path_string, filesystem=None)
 
def list_files (self, path_string, recursion_level=0, include_dir_names=False, filesystem=None)
 
def open_inputstream (self, path, filesystem=None)
 
def open_outputstream (self, path, filesystem=None)
 
def close_stream (self, input_stream)
 
def get_stream_offset (self, input_stream)
 
def get_path_size (self, path, filesystem=None)
 
def read_inputstream (self, input_stream, size=None)
 
def write_outputstream (self, output_stream, data)
 
def pass_to_outputstream (self, output_stream, input_stream)
 
def seek_stream (self, stream, offset, whence=os.SEEK_SET)
 
- Public Member Functions inherited from plaso.tarzan.lib.hdfs.Hdfs
def make_uri (self, filesystem=None)
 
def make_path (self, path_string, qualified=True, filesystem=None)
 
def make_simple_path (self, path_string, filesystem=None)
 
def make_qualified_path (self, path_string, filesystem=None)
 
def append_to_path (self, original_path, new_child_string)
 
def basename (self, path_string)
 
def dirname (self, path_string)
 

Static Public Attributes

 fs = None
 
- Static Public Attributes inherited from plaso.tarzan.lib.hdfs.Hdfs
string PATH_SEPARATOR = '/'
 
string SCHEME = 'hdfs'
 

Additional Inherited Members

- Static Public Member Functions inherited from plaso.tarzan.lib.hdfs.Hdfs
def parse_uri (hdfs_uri)
 

Detailed Description

HDFS driver utilizing PyArrow library.

Member Function Documentation

◆ close_filesystem()

def plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.close_filesystem (   self,
  filesystem = None 
)
Close a given HDFS filesystem.
:param filesystem: the filesystem

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.

◆ close_stream()

def plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.close_stream (   self,
  input_stream 
)
Close a given (previously opened) input-stream for a HDFS file.
:param input_stream: the opened input-stream

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.

◆ exists()

def plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.exists (   self,
  path_string,
  filesystem = None 
)
Check if a given HDFS path exists in the given HDFS filesystem.
:param path_string: the path
:param filesystem: the filesystem
:return: True if the path exists in the filesystem, False otherwise

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.

◆ get_filesystem()

def plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.get_filesystem (   self,
  force_filesystem = None 
)
Get a given or a default HDFS filesystem.
:param force_filesystem: the given filesystem
:return: the filesystem

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.

◆ get_path_size()

def plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.get_path_size (   self,
  path,
  filesystem = None 
)
Get the size of a given HDFS path (a file) in a given filesystem.
:param path: the path
:param filesystem: the filesystem
:return: the size

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.

◆ get_stream_offset()

def plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.get_stream_offset (   self,
  input_stream 
)
Get the current position (an offset) in a given (previously opened) input-stream for a HDFS file.
:param input_stream: the opened input-stream
:return: the position/offset

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.

◆ info()

def plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.info (   self,
  path_string,
  filesystem = None 
)
Get metadata of a given path in a given or a default filesystem.
:param path_string: the path
:param filesystem: the filesystem
:return: the metadata dictionary

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.

◆ list_files()

def plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.list_files (   self,
  path_string,
  recursion_level = 0,
  include_dir_names = False,
  filesystem = None 
)
Get a list of files (optionally recursively to the specified level) in a given HDFS path of a given filesystem.
:param path_string: the path
:param recursion_level: the level for the recursion (0 is just a current directory without any recursion)
:param include_dir_names: True to also include names of directories (suffixed by /)
:param filesystem: the filesystem
:return: the list of files in the path

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.

◆ mkdir()

def plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.mkdir (   self,
  path_string,
  filesystem = None 
)
Make a new directory (if it does not exist) at a given path in a given or a default filesystem.
:param path_string: the path
:param filesystem: the filesystem

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.

◆ open_filesystem()

def plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.open_filesystem (   self,
  hdfs_uri,
  user = "hadoop" 
)
Open HDFS filesystem of a given URI and as a given HDFS user.
:param hdfs_uri: HDFS URI to open
:param user: HDFS user to act as when opening
:return: the opened filesystem as an object of pyarrow.HadoopFileSystem

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.

◆ open_inputstream()

def plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.open_inputstream (   self,
  path,
  filesystem = None 
)
Open and get an input-stream for a HDFS file given by its path in a given filesystem.
:param path: the path
:param filesystem: the filesystem
:return: the opened input-stream

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.

◆ open_outputstream()

def plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.open_outputstream (   self,
  path,
  filesystem = None 
)
Open and get an output-stream for a HDFS file given by its path in a given filesystem.
:param path: the path
:param filesystem: the filesystem
:return: the opened output-stream

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.

◆ pass_to_outputstream()

def plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.pass_to_outputstream (   self,
  output_stream,
  input_stream 
)
Pass data from the input stream to the output stream of a HDFS file.
:param output_stream: the output-stream of the HDFS file
:param input_stream: the input-stream

◆ read_inputstream()

def plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.read_inputstream (   self,
  input_stream,
  size = None 
)
Read data from a given opened input stream for a HDFS file.
:param input_stream: the input-stream
:param size: the size of data to read
:return: the data

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.

◆ remove()

def plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.remove (   self,
  path_string,
  recursive = True,
  filesystem = None 
)
Remove (optionally recursively) a HDFS file/directory given by its path in a given or a default filesystem.
:param path_string: the path
:param recursive: True to remove recursively
:param filesystem: the filesystem

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.

◆ seek_stream()

def plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.seek_stream (   self,
  stream,
  offset,
  whence = os.SEEK_SET 
)
Set a given position (an offset) in a given (previously opened) input-stream for a HDFS file.
:param stream: the opened stream
:param offset: the position/offset
:param whence: the reference point the offset is relative to (os.SEEK_SET by default)

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.

◆ write_outputstream()

def plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.write_outputstream (   self,
  output_stream,
  data 
)
Write data buffer to a given opened output stream for a HDFS file.
:param output_stream: the output-stream
:param data: the data buffer to write
:return: the number of bytes written

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.

Member Data Documentation

◆ fs

plaso.tarzan.lib.pyarrow_hdfs.PyArrowHdfs.fs = None
static

The documentation for this class was generated from the following file: