This is off the top of my head:

def canonical_url(u):
    u = u.lower()
    if u.startswith("http://"):
        u = u[7:]
    if u.startswith("www."):
        u = u[4:]
    if u.endswith("/"):
        u = u[:-1]
    return u

def same_urls(u1, u2):
    return canonical_url(u1) == canonical_url(u2)
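For example (a quick sanity check, not from the original answer):

>>> same_urls("HTTP://www.Example.com/", "example.com")
True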
You could look up the names using DNS and see if they point to the same IP address. Some minor string processing may be required first to strip out confusing characters.
from socket import gethostbyname_ex

urls = ['http://google.com', 'google.com/', 'www.google.com/', 'news.google.com']
data = []
for originalName in urls:
    print('url:', originalName)
    # Strip the scheme and anything after the hostname.
    name = originalName.strip()
    name = name.replace('http://', '')
    name = name.replace('http:', '')
    if name.find('/') > 0:
        name = name[:name.find('/')]
    if name.find('\\') > 0:
        name = name[:name.find('\\')]
    print('dns lookup:', name)
    if name:
        try:
            result = gethostbyname_ex(name)
        except OSError:
            continue  # Unable to resolve
        for ip in result[2]:
            print('ip:', ip)
            data.append((ip, originalName))
print(data)
result:

url: http://google.com
dns lookup: google.com
ip: 66.102.11.104
url: google.com/
dns lookup: google.com
ip: 66.102.11.104
url: www.google.com/
dns lookup: www.google.com
ip: 66.102.11.104
url: news.google.com
dns lookup: news.google.com
ip: 66.102.11.104
[('66.102.11.104', 'http://google.com'), ('66.102.11.104', 'google.com/'), ('66.102.11.104', 'www.google.com/'), ('66.102.11.104', 'news.google.com')]
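A minimal sketch of the comparison itself, building on the lookup above (the helper name is mine, not part of the answer):

from socket import gethostbyname_ex

def points_to_same_ip(host1, host2):
    # Treat two hostnames as "the same" if their resolved IP sets overlap.
    # Assumes both names resolve; raises socket.gaierror otherwise.
    ips1 = set(gethostbyname_ex(host1)[2])
    ips2 = set(gethostbyname_ex(host2)[2])
    return bool(ips1 & ips2)

Keep in mind that shared hosting and CDNs can put unrelated sites on the same IP, so this is a heuristic at best.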
Code:

from w3lib.url import url_query_cleaner
from url_normalize import url_normalize

urls = ['google.com',
        'google.com/',
        'http://google.com/',
        'http://google.com',
        'http://google.com?',
        'http://google.com/?',
        'http://google.com//',
        'http://google.com?utm_source=Google'
        ]

def canonical_url(u):
    u = url_normalize(u)
    u = url_query_cleaner(u, parameterlist=['utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content'], remove=True)
    if u.startswith("http://"):
        u = u[7:]
    if u.startswith("https://"):
        u = u[8:]
    if u.startswith("www."):
        u = u[4:]
    if u.endswith("/"):
        u = u[:-1]
    return u

list(map(canonical_url, urls))
Result:
['google.com',
'google.com',
'google.com',
'google.com',
'google.com',
'google.com',
'google.com',
'google.com'
]
urlcanon is a URL canonicalization (normalization) library for Python and Java. It provides a URL parser which preserves the input bytes exactly, and a precanned canonicalization ruleset that tries to match the normalization implicit in the parsing rules used by browsers.
>>> import urlcanon
>>> input_url = "http://///EXAMPLE.com:80/foo/../bar"
>>> parsed_url = urlcanon.parse_url(input_url)
>>> print(parsed_url)
http://///EXAMPLE.com:80/foo/../bar
>>> urlcanon.whatwg(parsed_url)
<urlcanon.parse.ParsedUrl object at 0x10eb13a58>
>>> print(parsed_url)
http://example.com/bar
>>> print(parsed_url.ssurt())
b'com,example,//:http/bar'
>>>
>>> rule = urlcanon.MatchRule(ssurt=b'com,example,//:http/bar')
>>> urlcanon.whatwg.rule_applies(rule, b'https://example..com/bar/baz')
False
>>> urlcanon.whatwg.rule_applies(rule, b'HTtp:////eXAMple.Com/bar//baz//..///quu')
True
pip install urlcanon
String inputUrl = "http://///EXAMPLE.com:80/foo/../bar";
ParsedUrl parsedUrl = ParsedUrl.parseUrl(inputUrl);
System.out.println(parsedUrl);
// http://///EXAMPLE.com:80/foo/../bar
Canonicalizer.WHATWG.canonicalize(parsedUrl);
System.out.println(parsedUrl);
// http://example.com/bar
System.out.println(parsedUrl.ssurt());
// "com,example,//:http/bar"
<dependency>
<groupId>org.netpreserve</groupId>
<artifactId>urlcanon</artifactId>
<version>0.4.0</version>
</dependency>
<dependency>
<groupId>com.ibm.icu</groupId>
<artifactId>icu4j</artifactId>
<version>53.1</version>
</dependency>
The canonical URL is a powerful tool in a webmaster's toolbox, and it's vital to stick to best practices when working with it in order to prevent indexing issues. When there are multiple versions of a page, make sure the canonical URL points to the preferred version you want to have indexed; when a canonical URL references another URL, this tells search engines which version of the page to index. The canonical URL can also prevent duplicate content in cases where the duplication goes beyond a single website: when content is published on several pages across several domains, a cross-domain canonical URL signals to search engines which version of the page should be indexed.
<head>
<link rel="canonical" href="https://www.contentkingapp.com/" />
</head>
HTTP/1.1 200 OK
Server: nginx
Date: Thu, 28 Apr 2016 11:54:25 GMT
Content-Type: application/pdf
Content-Length: 23629
Last-Modified: Fri, 29 Apr 2016 17:47:17 GMT
Link: <http://www.example.com/downloads/whitepaper.pdf>; rel="canonical"
<head>
<link rel="canonical" href="https://www.example.com/" />
<link rel="alternate" href="https://m.example.com/" />
</head>
<head>
<link rel="canonical" href="https://www.example.com/" />
</head>
<link rel="canonical" href="https://www.example.com/services/repairs/">
<link rel="canonical" href="repairs/">
One option for the query API is to put all request parameters in the query string; for example, you can do this for Amazon S3 to create a presigned URL. In that case, the canonical query string must include not only the parameters for the request, but also the parameters used as part of the signing process: the hashing algorithm, credential scope, date, and signed headers parameters. Build the canonical query string by starting with the first parameter name in the sorted list, and keep it as one continuous line of text in your code (examples below are broken across lines only for readability). You then include the hashed canonical request as part of the string to sign in Task 2: Create a string to sign for Signature Version 4.
CanonicalRequest =
HTTPRequestMethod + '\n' +
CanonicalURI + '\n' +
CanonicalQueryString + '\n' +
CanonicalHeaders + '\n' +
SignedHeaders + '\n' +
HexEncode(Hash(RequestPayload))
GET https://iam.amazonaws.com/?Action=ListUsers&Version=2010-05-08 HTTP/1.1
Host: iam.amazonaws.com
Content-Type: application/x-www-form-urlencoded; charset=utf-8
X-Amz-Date: 20150830T123600Z

For this request, the first three canonical request components are the HTTP method, the canonical URI (just "/" here, since the path is empty; a path such as "/documents and settings/" would be double-URI-encoded as /documents%2520and%2520settings/), and the canonical query string:

GET
/
Action=ListUsers&Version=2010-05-08
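As an illustration of the sorting and encoding rules described above, here is a minimal Python sketch (mine, not from the AWS docs) that builds a canonical query string from a dict of request parameters:

from urllib.parse import quote

def canonical_query_string(params):
    # Percent-encode names and values per RFC 3986 (leaving only the
    # unreserved characters A-Z, a-z, 0-9, '-', '_', '.', '~' as-is),
    # then sort by parameter name and join with '&'.
    encoded = sorted((quote(str(k), safe='-_.~'), quote(str(v), safe='-_.~'))
                     for k, v in params.items())
    return '&'.join(f'{k}={v}' for k, v in encoded)

canonical_query_string({'Version': '2010-05-08', 'Action': 'ListUsers'})
# 'Action=ListUsers&Version=2010-05-08'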
Calling bytes is only recommended under Unix; under Windows, the unicode form is the canonical representation of filesystem paths. Paths of a different flavour compare unequal and cannot be ordered. PurePosixPath, a subclass of PurePath, is the path flavour that represents non-Windows filesystem paths, while PosixPath, a subclass of Path and PurePosixPath, represents concrete non-Windows filesystem paths.
>>> from pathlib import Path
>>> p = Path('.')
>>> [x for x in p.iterdir() if x.is_dir()]
[PosixPath('.hg'), PosixPath('docs'), PosixPath('dist'),
 PosixPath('__pycache__'), PosixPath('build')]
>>> list(p.glob('**/*.py'))
[PosixPath('test_pathlib.py'), PosixPath('setup.py'),
 PosixPath('pathlib.py'), PosixPath('docs/conf.py'),
 PosixPath('build/lib/pathlib.py')]
>>> p = Path('/etc')
>>> q = p / 'init.d' / 'reboot'
>>> q
PosixPath('/etc/init.d/reboot')
>>> q.resolve()
PosixPath('/etc/rc.d/init.d/halt')
>>> q.exists()
True
>>> q.is_dir()
False
>>> with q.open() as f: f.readline()
...
'#!/bin/bash\n'
import advertools as adv
import pandas as pd
from lxml import etree
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
Millions of URLs can be loaded into a pandas DataFrame with advertools, as shown below.
sitemap_url = "https://www.complaintsboard.com/sitemap.xml"
sitemap = adv.sitemap_to_df(sitemap_url)
sitemap.to_csv("sitemap.csv")
sitemap_df = pd.read_csv("sitemap.csv", index_col=False)
sitemap_df.drop(columns=["Unnamed: 0"], inplace=True)
sitemap_df
To understand which tags are used or not within the Sitemap XML file, use the function below.
def check_sitemap_tag_usage(sitemap):
    lastmod = sitemap["lastmod"].isna().value_counts()
    priority = sitemap["priority"].isna().value_counts()
    changefreq = sitemap["changefreq"].isna().value_counts()
    lastmod_perc = sitemap["lastmod"].isna().value_counts(normalize=True) * 100
    priority_perc = sitemap["priority"].isna().value_counts(normalize=True) * 100
    changefreq_perc = sitemap["changefreq"].isna().value_counts(normalize=True) * 100
    sitemap_tag_usage_df = pd.DataFrame(data={
        "lastmod": lastmod,
        "priority": priority,
        "changefreq": changefreq,
        "lastmod_perc": lastmod_perc,
        "priority_perc": priority_perc,
        "changefreq_perc": changefreq_perc
    })
    return sitemap_tag_usage_df.astype(int)
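For example, with the sitemap_df loaded above:

check_sitemap_tag_usage(sitemap_df)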
Use the following code block to check the ratio of http to https schemes among the URLs within the sitemap.
sitemap_url_df["scheme"].value_counts().to_frame()
To see whether the website has a robots.txt file, use the code block below.

import requests

r = requests.get("https://www.complaintsboard.com/robots.txt")
r.status_code

200
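Going one step further (not part of the original walkthrough), advertools can also parse the robots.txt file itself into a data frame:

robots_df = adv.robotstxt_to_df("https://www.complaintsboard.com/robots.txt")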