ahk_webcopy || ahk_httrack || fullwebsite or webpage copy with python layer

Post your working scripts, libraries and tools.
sashaatx
Posts: 333
Joined: 27 May 2021, 08:27
Contact:

ahk_webcopy || ahk_httrack || fullwebsite or webpage copy with python layer

Post by sashaatx » 28 Mar 2023, 11:49

I've always been a big fan of httrack, but it's not always available, working, or up to date. I use it for template grabbing, asset grabbing, whatever white hat stuff I want from the web.

Here's a quick ahk script that calls a python library (Pywebcopy) compiled to a 9 MB exe. Download websites, webpages, assets, skip past robots. It handles threading, zips, html, whatever you need.


exe and ahk found here, no need to know python, compiled.
https://github.com/samfisherirl/ahk_pywebcopy/releases

Code: Select all

		; Field order: mode ("website" or "webpage") || url || local folder to save into || project name
		; Two sample configurations are shown; the second assignment replaces the first, so keep only the one you need.
definitions := ["website", "http://nytimes.com", A_ScriptDir "\temp", "NameProj"]
definitions := ["webpage", "http://www.nysed.gov/college-transcripts", A_ScriptDir "\differentFolder", "DifferentName"]
full ahk code

Code: Select all


; Sample configuration for a whole-site copy. NOTE: this value is replaced by the
; second assignment below, so only the "webpage" configuration is actually used.
definitions := ["website", "http://nytimes.com", A_ScriptDir "\temp", "NameProj"]
				; Field order: mode ("website" or "webpage") || url || local folder to save into || project name
definitions := ["webpage", "http://www.nysed.gov/college-transcripts", A_ScriptDir "\differentFolder", "DifferentName"]

; Constructing Web writes the settings to command.txt; runIt() then launches ahk_webcopy.exe.
obj := Web(definitions)
obj.runIt()


; Web: writes the download settings to command.txt (next to the script) and
; launches ahk_webcopy.exe, which is started with the script folder as its
; working directory.
class Web
{
	; def: Array of [mode, url, folder to save into, project name],
	;      where mode is "website" or "webpage".
	__New(def) {
		this.config := A_ScriptDir "\command.txt"
		this.command := def[1]
		this.url := def[2]
		this.path_to_save := def[3]
		this.name := def[4]

		; Fields are joined with ",," in the order command, url, folder, name.
		; The whole record must be ONE concatenation: a comma inside the
		; expression starts a new sub-expression, which previously dropped the
		; project name from the file.
		command_storage := this.command ",," this.url ",," this.path_to_save ",," this.name ",,"

		; Write to a staging file, then move it over command.txt in one step.
		; A leftover staging file is removed first because FileAppend would
		; otherwise append to its old content.
		staging := A_ScriptDir "\temp.txt"
		if FileExist(staging)
			FileDelete(staging)
		FileAppend(command_storage, staging)
		FileMove(staging, this.config, 1)
	}

	; Starts the helper executable with the script folder as working directory.
	runIt() {
		Run("ahk_webcopy.exe", A_ScriptDir)
	}

}



python code:

Code: Select all

from pywebcopy import save_webpage, save_website
from pathlib import Path
from os import mkdir

class Save:
    """Run a pywebcopy download described by a ``,,``-separated command file.

    The file holds four fields in this order: command, url, folder to save
    into, project name. A command containing ``webpage`` triggers
    ``save_webpage``; one containing ``website`` triggers ``save_website``.
    """

    def __init__(self, path):
        """Store the command-file path; all parsed fields start out empty."""
        self.path = path
        self.config = ""
        self.command = ""
        self.save_path = ""
        self.name = ""
        self.url = ""

    def get_config(self):
        """Read and parse the command file, create the target folder, dispatch.

        Raises:
            ValueError: if the file holds fewer than four ``,,``-separated fields.
        """
        with open(self.path, "r") as f:
            self.config = f.read()

        # Strip each field so a trailing newline or stray spaces in the file
        # do not end up inside the URL, folder or project name.
        fields = [part.strip() for part in self.config.split(",,")]
        if len(fields) < 4:
            raise ValueError(
                "command file needs 4 ',,'-separated fields "
                f"(command, url, folder, name); got {len(fields)}"
            )
        self.command, self.url, self.save_path, self.name = fields[:4]

        # parents/exist_ok: nested targets work and re-running into an existing
        # folder is not an error. Remaining failures are reported but do not
        # stop the run, as before.
        try:
            Path(self.save_path).mkdir(parents=True, exist_ok=True)
        except OSError as e:
            print(e)

        if "webpage" in self.command:
            self.webpage()
        if "website" in self.command:
            self.website()

    def _options(self):
        """Return the keyword arguments shared by both pywebcopy entry points."""
        return dict(
            url=f"{self.url}",
            project_folder=f"{self.save_path}",
            project_name=f"{self.name}",
            bypass_robots=True,
            debug=True,
            open_in_browser=True,
            delay=None,
            threaded=False,
        )

    def webpage(self):
        """Call ``save_webpage`` with the parsed settings."""
        save_webpage(**self._options())

    def website(self):
        """Call ``save_website`` with the parsed settings."""
        save_website(**self._options())

if __name__ == "__main__":
    # command.txt is looked up in the current working directory.
    Save(Path.cwd() / "command.txt").get_config()
https://github.com/samfisherirl
? /Easy-Auto-GUI-for-AHK-v2 ? /Useful-AHK-v2-Libraries-and-Classes : /Pulovers-Macro-Creator-for-AHKv2 :

denbartman
Posts: 4
Joined: 12 Apr 2021, 18:02

Re: ahk_webcopy || ahk_httrack || fullwebsite or webpage copy with python layer

Post by denbartman » 16 Apr 2023, 18:16

Hi,

Thanx for this Script.
I noticed the script only processes the first parameter.
I made some changes to ahk_requests.ahk and ahk_requests.py.
Now it seems to handle multiple parameters.
It's my first time programming in Python, so it might still have some bugs.
In my tests it seems to work ok :-)
You can use the code to update your github page if you like...

kind regards,
Bart

ahk_requests.py

Code: Select all

from requests import get
from pathlib import Path
from os import remove
from jsons import load, dump
from datetime import datetime

class AHKRequest:
    """Perform one HTTP GET described by text files located next to this file.

    Input comes from ``get.txt`` (url, query parameters, headers) and
    ``xtra.txt`` (``allowRedirects`` / ``stream`` flags). The response is kept
    in ``responsetxt`` and ``responsejson`` for the caller to write out.
    """

    def __init__(self):
        """Set up file paths and defaults, then load the request files."""
        self.cwd = Path(__file__).parent.resolve()
        self.xtra = self.cwd / "xtra.txt"
        self.get = self.cwd / "get.txt"
        self.response = self.cwd / "response.txt"
        self.json = self.cwd / "jdata.json"
        self.redir = False
        self.stream = False
        self.headers = {}
        self.errors = self.cwd / "ERRORS.txt"
        self.params = {}
        self.responsetxt = ""
        self.responsejson = ""
        self.url = ""
        print(self.get)
        print(self.response)
        print(self.json)
        self.read_data()

    def read_data(self):
        """Load flags and the request description; delete stale output files.

        Each line of ``get.txt`` is classified by the marker it contains:
        ``__hk__``/``__hv__`` wrap a header key/value, ``__pk__``/``__pv__``
        wrap a query-parameter key/value, and ``__url__`` wraps the URL.
        When no header line is present, default_headers() is used.
        """
        print(self.cwd)
        self.readfirst()

        if self.get.is_file():
            with open(self.get, "r") as f:
                for line in f:
                    if "__hk__" in line:
                        hk = line.split("__hk__")[1]
                        hv = line.split("__hv__")[1]
                        print(hk)
                        print(hv)
                        self.headers[hk] = hv
                    elif "__pk__" in line:
                        pk = line.split("__pk__")[1]
                        pv = line.split("__pv__")[1]
                        print(pk)
                        print(pv)
                        self.params[pk] = pv
                    elif "__url__" in line:
                        self.url = line.split("__url__")[1]
                        print(self.url)
        if self.response.is_file():
            remove(self.response)
        if self.json.is_file():
            remove(self.json)
        if self.headers == {}:
            self.headers = self.default_headers()

    def default_headers(self):
        """Return the fallback headers: a single browser-like User-Agent."""
        return {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}

    @staticmethod
    def _read_flag(line, key):
        """Return the bool written after ``key==`` in *line*, or None if absent."""
        marker = key + "=="
        pos = line.find(marker)
        if pos < 0:
            return None
        return line[pos + len(marker):].startswith("True")

    def readfirst(self):
        """Read the ``allowRedirects`` and ``stream`` flags from ``xtra.txt``.

        Both keys are searched for in every line, so a file in which the two
        settings ended up fused on one line
        (``allowRedirects==Truestream==True``) still yields both values.
        """
        if not self.xtra.is_file():
            return
        with open(self.xtra, "r") as f:
            for line in f:
                redir = AHKRequest._read_flag(line, "allowRedirects")
                if redir is not None:
                    self.redir = redir
                stream = AHKRequest._read_flag(line, "stream")
                if stream is not None:
                    self.stream = stream

    @staticmethod
    def splitter(i):
        """Return True when the text after the first ``==`` in *i* contains "True".

        Always returns a bool: lines without ``==`` or with an unrecognised
        value yield False instead of raising IndexError or returning None.
        """
        parts = i.split("==")
        return len(parts) > 1 and "True" in parts[1]

    def download_data(self):
        """Send the GET request and store the body as text and as JSON.

        ``responsetxt`` is the body with newlines removed. ``responsejson`` is
        the parsed body re-serialised as JSON, or ``""`` when the body is not
        valid JSON (the text is still kept in that case).
        """
        # Function-scope import: this file only imports the third-party `jsons`.
        import json

        response = get(
            url=self.url,
            headers=self.headers,
            # None (not False) is the documented "no query string" value;
            # self.params itself is left untouched.
            params=self.params or None,
            allow_redirects=self.redir,
            stream=self.stream,
        )
        self.responsetxt = "".join(response.text.split("\n"))
        try:
            # json.dumps yields real JSON; str(dict) would produce a Python
            # repr with single quotes that JSON parsers reject.
            self.responsejson = json.dumps(response.json())
        except ValueError:
            self.responsejson = ""

    @staticmethod
    def write_data(file, text):
        """Write *text* to *file*, replacing previous content, and echo it."""
        # Mode "w" truncates, so no separate delete is needed beforehand.
        with open(file, "w") as f:
            f.write(text)
        print(str(text))
        
if __name__ == "__main__":
    # Bound before the try so the error handler can tell whether the
    # constructor itself failed (it reads and parses the request files).
    ahk = None
    try:
        ahk = AHKRequest()
        ahk.download_data()
        ahk.write_data(ahk.json, ahk.responsejson)
        ahk.write_data(ahk.response, ahk.responsetxt)
        # The request files are removed last, after both outputs are written.
        # A file that is already gone must not be reported as a failure.
        for consumed in (ahk.get, ahk.xtra):
            if consumed.is_file():
                remove(consumed)
    except Exception as e:
        time = str(datetime.now())
        if ahk is not None:
            error_file = ahk.errors
            state = ahk.__dict__
        else:
            # Construction failed: fall back to the same location the class uses.
            error_file = Path(__file__).parent.resolve() / "ERRORS.txt"
            state = {}
        with open(error_file, "w") as f:
            f.write(str(e))
            f.write(f"\nat: {time}\n\n\n")
            f.write(str(state))

ahk_requests.ahk

Code: Select all

; #Include JXON.ahk

/*

url := "https://httpbin.org/get"
headers := Map("myheaderkey", "myheaderval")
params := False
; see bottom for additional params- #1

req := requests(url, headers)

params := Map("myparamskey", "myparamsval", "myparamskey2", "myparamsval2")
headers := False => converted to {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}

req.allowRedirects := True ;optional
req.stream := True ;optional
req.py := A_ScriptDir "\ahk_requests.exe" ;optional: point at ahk_requests.exe instead of the default ahk_requests.py


req.get()
msgbox(req.jdata["origin"])
msgbox(req.txt)
*/

; requests: writes the request settings to temp files next to this script,
; runs the Python helper (this.py) and reads the values it writes back.
;   xtra.txt     - allowRedirects / stream flags, one per line
;   get.txt      - url, query parameters and headers, one per line
;   response.txt - response body written by the helper
class requests
{
    ; url     - address to GET
    ; headers - Map of header name -> value, or False for the helper's default
    ; params  - Map of query parameter name -> value, or False for none
    __New(url, headers := False, params := False) {
        this.url := url
        this.paramKeys := []
        this.paramVals := []
        this.headerKeys := []
        this.headerVals := []
        SplitPath(A_LineFile, , &Dir)
        this.dir := Dir
        this.py := Dir "\ahk_requests.py"

        this.headers := this.headerEnum(headers)
        this.params := this.paramEnum(params)
        this.allowRedirects := False
        this.stream := False
        this.xtrapath := Dir "\xtra.txt"
        this.getpath := Dir "\get.txt"
        this.response := Dir "\response.txt"
        this.json := Dir "\jdata.json"
        this.txt := ""
        this.jdata := ""
        this.pid := ""
    }

    ; Writes the request files, runs the helper and waits for its answer.
    ; Returns True when a response was read, False on timeout.
    get() {
        this.appendXtra()
        this.appendParams()
        this.execute()
        return this.waitResponse()
    }

    ; Wraps every header key/value in its __hk__ / __hv__ markers.
    ; Returns True when a Map (or other enumerable object) was supplied.
    headerEnum(headers) {
        ; IsObject also rejects "" and other non-enumerable values that the
        ; for-loop below cannot iterate.
        if !IsObject(headers)
            return False
        for key, value in headers
        {
            this.headerKeys.Push("__hk__" . key . "__hk__")
            this.headerVals.Push("__hv__" . value . "__hv__")
        }
        return True
    }

    ; Wraps every query parameter key/value in its __pk__ / __pv__ markers.
    ; Returns True when a Map (or other enumerable object) was supplied.
    paramEnum(params) {
        if !IsObject(params)
            return False
        for key, value in params
        {
            this.paramKeys.Push("__pk__" . key . "__pk__")
            this.paramVals.Push("__pv__" . value . "__pv__")
        }
        return True
    }

    ; Writes the two flags to xtra.txt, each on its own line.
    ; Locals are used so this.allowRedirects / this.stream keep their boolean
    ; values and get() can be called again on the same object.
    appendXtra() {
        redirectLine := "allowRedirects==" . (this.allowRedirects ? "True" : "False") . "`n"
        streamLine := "stream==" . (this.stream ? "True" : "False") . "`n"
        this.deleteIfExists(this.xtrapath)
        FileAppend(redirectLine . streamLine, this.xtrapath)
    }

    ; Writes url, parameters and headers to get.txt: the url line, one line per
    ; parameter, an empty line, then one line per header.
    ; this.url is not modified, so repeated get() calls do not nest the markers.
    appendParams() {
        body := "__url__" . this.url . "__url__`n"
        Loop this.paramKeys.Length
            body .= this.paramKeys[A_Index] . this.paramVals[A_Index] . "`n"
        body .= "`n"
        Loop this.headerKeys.Length
            body .= this.headerKeys[A_Index] . this.headerVals[A_Index] . "`n"
        this.deleteIfExists(this.getpath)
        FileAppend(body, this.getpath)
    }

    ; Clears old output files and starts the helper hidden, keeping its PID.
    execute() {
        this.deleteIfExists(this.response)
        this.deleteIfExists(this.json)
        Run(this.py, this.dir, "Hide", &pid)
        this.pid := pid
    }

    ; Polls every 200 ms, up to 100 times, until xtra.txt and get.txt are both
    ; gone, then reads response.txt into this.txt and parses the same text
    ; into this.jdata with Jxon_Load.
    ; Returns True when response.txt was read, otherwise False.
    waitResponse() {
        Loop 100 {
            Sleep(200)
            if FileExist(this.xtrapath) || FileExist(this.getpath)
                continue
            if !FileExist(this.response)
                return False
            raw := FileRead(this.response)
            this.txt := raw
            ; A body that is not JSON leaves jdata empty; txt is still usable.
            try
                this.jdata := Jxon_Load(&raw)
            catch
                this.jdata := ""
            return True
        }
        return False
    }

    ; Best-effort delete: a missing or locked file is not treated as an error.
    deleteIfExists(path) {
        try FileDelete(path)
    }

}

/*
_________________________________________
appendix of sorts -- additional parameters corresponding to the see-bottom numbers above
_________________________________________


___________________1_____________________
other methods
 headers := False
 params := Map("myparamskey", "myparamsval", "myparamskey2", "myparamsval2")
 headers := False => converted to {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}
 
 or:
 headers := False => converted to {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}
 
 params := False


___________________1_____________________

sashaatx
Posts: 333
Joined: 27 May 2021, 08:27
Contact:

Re: ahk_webcopy || ahk_httrack || fullwebsite or webpage copy with python layer

Post by sashaatx » 16 Apr 2023, 21:26

denbartman wrote:
16 Apr 2023, 18:16
Hi,

Thanx for this Script.
I noticed the script only processes the first parameter.
I made some changes to ahk_requests.ahk and ahk_requests.py.
Now it seems to handle multiple parameters.
It's my first time programming in Python, so it might still have some bugs.
In my tests it seems to work ok :-)
You can use the code to update your github page if you like...

kind regards,
Bart

ahk_requests.py

Code: Select all

from requests import get
from pathlib import Path
from os import remove
from jsons import load, dump
from datetime import datetime

class AHKRequest:
    """Perform one HTTP GET described by text files located next to this file.

    Input comes from ``get.txt`` (url, query parameters, headers) and
    ``xtra.txt`` (``allowRedirects`` / ``stream`` flags). The response is kept
    in ``responsetxt`` and ``responsejson`` for the caller to write out.
    """

    def __init__(self):
        """Set up file paths and defaults, then load the request files."""
        self.cwd = Path(__file__).parent.resolve()
        self.xtra = self.cwd / "xtra.txt"
        self.get = self.cwd / "get.txt"
        self.response = self.cwd / "response.txt"
        self.json = self.cwd / "jdata.json"
        self.redir = False
        self.stream = False
        self.headers = {}
        self.errors = self.cwd / "ERRORS.txt"
        self.params = {}
        self.responsetxt = ""
        self.responsejson = ""
        self.url = ""
        print(self.get)
        print(self.response)
        print(self.json)
        self.read_data()

    def read_data(self):
        """Load flags and the request description; delete stale output files.

        Each line of ``get.txt`` is classified by the marker it contains:
        ``__hk__``/``__hv__`` wrap a header key/value, ``__pk__``/``__pv__``
        wrap a query-parameter key/value, and ``__url__`` wraps the URL.
        When no header line is present, default_headers() is used.
        """
        print(self.cwd)
        self.readfirst()

        if self.get.is_file():
            with open(self.get, "r") as f:
                for line in f:
                    if "__hk__" in line:
                        hk = line.split("__hk__")[1]
                        hv = line.split("__hv__")[1]
                        print(hk)
                        print(hv)
                        self.headers[hk] = hv
                    elif "__pk__" in line:
                        pk = line.split("__pk__")[1]
                        pv = line.split("__pv__")[1]
                        print(pk)
                        print(pv)
                        self.params[pk] = pv
                    elif "__url__" in line:
                        self.url = line.split("__url__")[1]
                        print(self.url)
        if self.response.is_file():
            remove(self.response)
        if self.json.is_file():
            remove(self.json)
        if self.headers == {}:
            self.headers = self.default_headers()

    def default_headers(self):
        """Return the fallback headers: a single browser-like User-Agent."""
        return {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}

    @staticmethod
    def _read_flag(line, key):
        """Return the bool written after ``key==`` in *line*, or None if absent."""
        marker = key + "=="
        pos = line.find(marker)
        if pos < 0:
            return None
        return line[pos + len(marker):].startswith("True")

    def readfirst(self):
        """Read the ``allowRedirects`` and ``stream`` flags from ``xtra.txt``.

        Both keys are searched for in every line, so a file in which the two
        settings ended up fused on one line
        (``allowRedirects==Truestream==True``) still yields both values.
        """
        if not self.xtra.is_file():
            return
        with open(self.xtra, "r") as f:
            for line in f:
                redir = AHKRequest._read_flag(line, "allowRedirects")
                if redir is not None:
                    self.redir = redir
                stream = AHKRequest._read_flag(line, "stream")
                if stream is not None:
                    self.stream = stream

    @staticmethod
    def splitter(i):
        """Return True when the text after the first ``==`` in *i* contains "True".

        Always returns a bool: lines without ``==`` or with an unrecognised
        value yield False instead of raising IndexError or returning None.
        """
        parts = i.split("==")
        return len(parts) > 1 and "True" in parts[1]

    def download_data(self):
        """Send the GET request and store the body as text and as JSON.

        ``responsetxt`` is the body with newlines removed. ``responsejson`` is
        the parsed body re-serialised as JSON, or ``""`` when the body is not
        valid JSON (the text is still kept in that case).
        """
        # Function-scope import: this file only imports the third-party `jsons`.
        import json

        response = get(
            url=self.url,
            headers=self.headers,
            # None (not False) is the documented "no query string" value;
            # self.params itself is left untouched.
            params=self.params or None,
            allow_redirects=self.redir,
            stream=self.stream,
        )
        self.responsetxt = "".join(response.text.split("\n"))
        try:
            # json.dumps yields real JSON; str(dict) would produce a Python
            # repr with single quotes that JSON parsers reject.
            self.responsejson = json.dumps(response.json())
        except ValueError:
            self.responsejson = ""

    @staticmethod
    def write_data(file, text):
        """Write *text* to *file*, replacing previous content, and echo it."""
        # Mode "w" truncates, so no separate delete is needed beforehand.
        with open(file, "w") as f:
            f.write(text)
        print(str(text))
        
if __name__ == "__main__":
    # Bound before the try so the error handler can tell whether the
    # constructor itself failed (it reads and parses the request files).
    ahk = None
    try:
        ahk = AHKRequest()
        ahk.download_data()
        ahk.write_data(ahk.json, ahk.responsejson)
        ahk.write_data(ahk.response, ahk.responsetxt)
        # The request files are removed last, after both outputs are written.
        # A file that is already gone must not be reported as a failure.
        for consumed in (ahk.get, ahk.xtra):
            if consumed.is_file():
                remove(consumed)
    except Exception as e:
        time = str(datetime.now())
        if ahk is not None:
            error_file = ahk.errors
            state = ahk.__dict__
        else:
            # Construction failed: fall back to the same location the class uses.
            error_file = Path(__file__).parent.resolve() / "ERRORS.txt"
            state = {}
        with open(error_file, "w") as f:
            f.write(str(e))
            f.write(f"\nat: {time}\n\n\n")
            f.write(str(state))

ahk_requests.ahk

Code: Select all

; #Include JXON.ahk

/*

url := "https://httpbin.org/get"
headers := Map("myheaderkey", "myheaderval")
params := False
; see bottom for additional params- #1

req := requests(url, headers)

params := Map("myparamskey", "myparamsval", "myparamskey2", "myparamsval2")
headers := False => converted to {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}

req.allowRedirects := True ;optional
req.stream := True ;optional
req.py := A_ScriptDir "\ahk_requests.exe" ;optional: point at ahk_requests.exe instead of the default ahk_requests.py


req.get()
msgbox(req.jdata["origin"])
msgbox(req.txt)
*/

; requests: writes the request settings to temp files next to this script,
; runs the Python helper (this.py) and reads the values it writes back.
;   xtra.txt     - allowRedirects / stream flags, one per line
;   get.txt      - url, query parameters and headers, one per line
;   response.txt - response body written by the helper
class requests
{
    ; url     - address to GET
    ; headers - Map of header name -> value, or False for the helper's default
    ; params  - Map of query parameter name -> value, or False for none
    __New(url, headers := False, params := False) {
        this.url := url
        this.paramKeys := []
        this.paramVals := []
        this.headerKeys := []
        this.headerVals := []
        SplitPath(A_LineFile, , &Dir)
        this.dir := Dir
        this.py := Dir "\ahk_requests.py"

        this.headers := this.headerEnum(headers)
        this.params := this.paramEnum(params)
        this.allowRedirects := False
        this.stream := False
        this.xtrapath := Dir "\xtra.txt"
        this.getpath := Dir "\get.txt"
        this.response := Dir "\response.txt"
        this.json := Dir "\jdata.json"
        this.txt := ""
        this.jdata := ""
        this.pid := ""
    }

    ; Writes the request files, runs the helper and waits for its answer.
    ; Returns True when a response was read, False on timeout.
    get() {
        this.appendXtra()
        this.appendParams()
        this.execute()
        return this.waitResponse()
    }

    ; Wraps every header key/value in its __hk__ / __hv__ markers.
    ; Returns True when a Map (or other enumerable object) was supplied.
    headerEnum(headers) {
        ; IsObject also rejects "" and other non-enumerable values that the
        ; for-loop below cannot iterate.
        if !IsObject(headers)
            return False
        for key, value in headers
        {
            this.headerKeys.Push("__hk__" . key . "__hk__")
            this.headerVals.Push("__hv__" . value . "__hv__")
        }
        return True
    }

    ; Wraps every query parameter key/value in its __pk__ / __pv__ markers.
    ; Returns True when a Map (or other enumerable object) was supplied.
    paramEnum(params) {
        if !IsObject(params)
            return False
        for key, value in params
        {
            this.paramKeys.Push("__pk__" . key . "__pk__")
            this.paramVals.Push("__pv__" . value . "__pv__")
        }
        return True
    }

    ; Writes the two flags to xtra.txt, each on its own line.
    ; Locals are used so this.allowRedirects / this.stream keep their boolean
    ; values and get() can be called again on the same object.
    appendXtra() {
        redirectLine := "allowRedirects==" . (this.allowRedirects ? "True" : "False") . "`n"
        streamLine := "stream==" . (this.stream ? "True" : "False") . "`n"
        this.deleteIfExists(this.xtrapath)
        FileAppend(redirectLine . streamLine, this.xtrapath)
    }

    ; Writes url, parameters and headers to get.txt: the url line, one line per
    ; parameter, an empty line, then one line per header.
    ; this.url is not modified, so repeated get() calls do not nest the markers.
    appendParams() {
        body := "__url__" . this.url . "__url__`n"
        Loop this.paramKeys.Length
            body .= this.paramKeys[A_Index] . this.paramVals[A_Index] . "`n"
        body .= "`n"
        Loop this.headerKeys.Length
            body .= this.headerKeys[A_Index] . this.headerVals[A_Index] . "`n"
        this.deleteIfExists(this.getpath)
        FileAppend(body, this.getpath)
    }

    ; Clears old output files and starts the helper hidden, keeping its PID.
    execute() {
        this.deleteIfExists(this.response)
        this.deleteIfExists(this.json)
        Run(this.py, this.dir, "Hide", &pid)
        this.pid := pid
    }

    ; Polls every 200 ms, up to 100 times, until xtra.txt and get.txt are both
    ; gone, then reads response.txt into this.txt and parses the same text
    ; into this.jdata with Jxon_Load.
    ; Returns True when response.txt was read, otherwise False.
    waitResponse() {
        Loop 100 {
            Sleep(200)
            if FileExist(this.xtrapath) || FileExist(this.getpath)
                continue
            if !FileExist(this.response)
                return False
            raw := FileRead(this.response)
            this.txt := raw
            ; A body that is not JSON leaves jdata empty; txt is still usable.
            try
                this.jdata := Jxon_Load(&raw)
            catch
                this.jdata := ""
            return True
        }
        return False
    }

    ; Best-effort delete: a missing or locked file is not treated as an error.
    deleteIfExists(path) {
        try FileDelete(path)
    }

}

/*
_________________________________________
appendix of sorts -- additional parameters corresponding to the see-bottom numbers above
_________________________________________


___________________1_____________________
other methods
 headers := False
 params := Map("myparamskey", "myparamsval", "myparamskey2", "myparamsval2")
 headers := False => converted to {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}
 
 or:
 headers := False => converted to {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}
 
 params := False


___________________1_____________________
Thank you. I'll update; I need to replace the strange key-value pair function with json.
https://github.com/samfisherirl
? /Easy-Auto-GUI-for-AHK-v2 ? /Useful-AHK-v2-Libraries-and-Classes : /Pulovers-Macro-Creator-for-AHKv2 :

Post Reply

Return to “Scripts and Functions (v2)”