WebSiteCloner
websitecloner.py

This script implements a website cloner, built around the
GetURLsFromHTML and WebSiteCloner classes.
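
Example (a minimal usage sketch based on the constructor signature
documented below; the module name is assumed from the file name
websitecloner.py, and the exact behavior of launch() depends on the
implementation):

    import logging
    from websitecloner import WebSiteCloner  # module name assumed

    # Clone a single page and its resources into the default directory.
    cloner = WebSiteCloner(
        "https://example.com/",
        directory="CloneWebSite",  # documented default
        recursive=False,           # set True to also clone linked pages
        loglevel=logging.INFO,     # 20, the documented default
    )
    cloner.launch()                # download the page and write its resources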

 
Modules
       
logging
ntpath

 
Classes
       
builtins.ValueError(builtins.Exception)
    UrlError
builtins.object
    WebSiteCloner
html.parser.HTMLParser(_markupbase.ParserBase)
    GetURLsFromHTML

 
class GetURLsFromHTML(html.parser.HTMLParser)
    GetURLsFromHTML(master)
 
This class gets URLs from HTML.
 
 
Method resolution order:
GetURLsFromHTML
html.parser.HTMLParser
_markupbase.ParserBase
builtins.object

Methods defined here:
__init__(self, master)
Initialize and reset this instance.
 
If convert_charrefs is True (the default), all character references
are automatically converted to the corresponding Unicode characters.
handle_starttag(self, tag: str, attributes: List[Tuple[str, str]]) -> None
This function gets URL attributes and their values from start tags.
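
For illustration, a minimal sketch of how an html.parser.HTMLParser
subclass can collect URL-bearing attribute values, in the spirit of
GetURLsFromHTML (the real class takes a master WebSiteCloner instance;
the attribute names checked here, href and src, are assumptions):

    from html.parser import HTMLParser
    from typing import List, Tuple

    class URLCollector(HTMLParser):
        # Hypothetical stand-in for GetURLsFromHTML: it gathers the
        # attribute values that usually carry URLs.
        def __init__(self) -> None:
            super().__init__()
            self.urls: List[str] = []

        def handle_starttag(self, tag: str, attributes: List[Tuple[str, str]]) -> None:
            for name, value in attributes:
                if name in ("href", "src") and value:
                    self.urls.append(value)

    parser = URLCollector()
    parser.feed('<a href="/page.html"><img src="logo.png"></a>')
    print(parser.urls)  # ['/page.html', 'logo.png']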

Methods inherited from html.parser.HTMLParser:
check_for_whole_start_tag(self, i)
# Internal -- check to see if we have a complete starttag; return end
# or -1 if incomplete.
clear_cdata_mode(self)
close(self)
Handle any buffered data.
feed(self, data)
Feed data to the parser.
 
Call this as often as you want, with as little or as much text
as you want (may include '\n').
get_starttag_text(self)
Return full source of start tag: '<...>'.
goahead(self, end)
# Internal -- handle data as far as reasonable.  May leave state
# and data to be processed by a subsequent call.  If 'end' is
# true, force handling all data as if followed by EOF marker.
handle_charref(self, name)
# Overridable -- handle character reference
handle_comment(self, data)
# Overridable -- handle comment
handle_data(self, data)
# Overridable -- handle data
handle_decl(self, decl)
# Overridable -- handle declaration
handle_endtag(self, tag)
# Overridable -- handle end tag
handle_entityref(self, name)
# Overridable -- handle entity reference
handle_pi(self, data)
# Overridable -- handle processing instruction
handle_startendtag(self, tag, attrs)
# Overridable -- finish processing of start+end tag: <tag.../>
parse_bogus_comment(self, i, report=1)
# Internal -- parse bogus comment, return length or -1 if not terminated
# see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
parse_endtag(self, i)
# Internal -- parse endtag, return end or -1 if incomplete
parse_html_declaration(self, i)
# Internal -- parse html declarations, return length or -1 if not terminated
# See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
# See also parse_declaration in _markupbase
parse_pi(self, i)
# Internal -- parse processing instr, return end or -1 if not terminated
parse_starttag(self, i)
# Internal -- handle starttag, return end or -1 if not terminated
reset(self)
Reset this instance.  Loses all unprocessed data.
set_cdata_mode(self, elem)
unknown_decl(self, data)

Data and other attributes inherited from html.parser.HTMLParser:
CDATA_CONTENT_ELEMENTS = ('script', 'style')

Methods inherited from _markupbase.ParserBase:
error(self, message)
getpos(self)
Return current line number and offset.
parse_comment(self, i, report=1)
# Internal -- parse comment, return length or -1 if not terminated
parse_declaration(self, i)
# Internal -- parse declaration (for use by subclasses).
parse_marked_section(self, i, report=1)
# Internal -- parse a marked section
# Override this to handle MS-word extension syntax <![if word]>content<![endif]>
updatepos(self, i, j)
# Internal -- update line number and offset.  This should be
# called for each piece of data exactly once, in order -- in other
# words the concatenation of all the input strings to this
# function should be exactly the entire input.

Data descriptors inherited from _markupbase.ParserBase:
__dict__
dictionary for instance variables (if defined)
__weakref__
list of weak references to the object (if defined)

 
class UrlError(builtins.ValueError)
    
Method resolution order:
UrlError
builtins.ValueError
builtins.Exception
builtins.BaseException
builtins.object

Data descriptors defined here:
__weakref__
list of weak references to the object (if defined)

Methods inherited from builtins.ValueError:
__init__(self, /, *args, **kwargs)
Initialize self.  See help(type(self)) for accurate signature.

Static methods inherited from builtins.ValueError:
__new__(*args, **kwargs) from builtins.type
Create and return a new object.  See help(type) for accurate signature.

Methods inherited from builtins.BaseException:
__delattr__(self, name, /)
Implement delattr(self, name).
__getattribute__(self, name, /)
Return getattr(self, name).
__reduce__(...)
Helper for pickle.
__repr__(self, /)
Return repr(self).
__setattr__(self, name, value, /)
Implement setattr(self, name, value).
__setstate__(...)
__str__(self, /)
Return str(self).
with_traceback(...)
Exception.with_traceback(tb) --
set self.__traceback__ to tb and return self.

Data descriptors inherited from builtins.BaseException:
__cause__
exception cause
__context__
exception context
__dict__
__suppress_context__
__traceback__
args
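
Since UrlError subclasses ValueError, callers can catch it either
specifically or as a plain ValueError. A hedged sketch (the validity
check and message are hypothetical, and the module name is assumed):

    from urllib.parse import urlparse
    from websitecloner import UrlError  # module name assumed

    url = "example.com/page"             # no scheme
    try:
        if not urlparse(url).scheme:     # hypothetical validity check
            raise UrlError("invalid URL (missing scheme): " + url)
    except ValueError as error:          # UrlError is also a ValueError
        print(error)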

 
class WebSiteCloner(builtins.object)
    WebSiteCloner(url: str, directory: str = 'CloneWebSite', recursive: bool = False, replace_domain: str = None, loglevel: int = 20, logfile: str = None, replace_scheme: str = None)
 
This class clones a page and its resources.
 
Methods defined here:
__init__(self, url: str, directory: str = 'CloneWebSite', recursive: bool = False, replace_domain: str = None, loglevel: int = 20, logfile: str = None, replace_scheme: str = None)
Initialize self.  See help(type(self)) for accurate signature.
add_new_url(self, url: str) -> None
This function adds a new URL to be parsed.
get_complete_url(self, url: str) -> str
This function builds a complete URL.
get_data(self, url: str) -> http.client.HTTPResponse
This function returns the HTTP response.
get_directory_from_url(self, url: str, url_parsed: urllib.parse.ParseResult) -> str
This function returns the directory where the file will be written.
get_full_path(self, url: str, directory: str, url_parsed: urllib.parse.ParseResult) -> str
This function returns the full path (directory + filename) where the file will be written (see the sketch after this list).
launch(self) -> None
Launcher to copy a website.
write_file(self, url: str, data: bytes) -> bytes
This function gets the URL path and writes the file to the correct location.
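
The path-handling methods above map a URL onto a local directory and
filename. A standard-library sketch of that idea, with hypothetical
helper names (the real methods also use the instance's configuration,
e.g. directory and replace_domain):

    import os
    from urllib.parse import urljoin, urlparse

    def complete_url(base: str, link: str) -> str:
        # In the spirit of get_complete_url: resolve a link found on a
        # page against that page's URL.
        return urljoin(base, link)

    def full_path(url: str, directory: str = "CloneWebSite") -> str:
        # In the spirit of get_full_path: derive directory + filename
        # from the URL path; directory-like paths get index.html.
        parsed = urlparse(url)
        path = parsed.path.lstrip("/")
        if not path or path.endswith("/"):
            path += "index.html"
        return os.path.join(directory, parsed.netloc, *path.split("/"))

    print(complete_url("https://example.com/a/", "../logo.png"))
    # -> https://example.com/logo.png
    print(full_path("https://example.com/css/site.css"))
    # -> CloneWebSite/example.com/css/site.css (POSIX separators)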

Data descriptors defined here:
__dict__
dictionary for instance variables (if defined)
__weakref__
list of weak references to the object (if defined)

 
Functions
       
main()
parse()

 
Data
List = typing.List
Tuple = typing.Tuple