1242. Web Crawler Multithreaded
Given a URL startUrl and an interface HtmlParser, implement a multithreaded web crawler that crawls all links under the same hostname as startUrl. Return all URLs obtained by your crawler, in any order.
Your crawler should: start from the page startUrl; call HtmlParser.getUrls(url) to get all URLs from the web page of the given URL; never crawl the same link twice; and explore only links that are under the same hostname as startUrl.
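The in-scope test boils down to comparing host components. A quick illustration with Python's standard urllib.parse (an aside for clarity, not part of the reference solutions):
from urllib.parse import urlparse

def same_host(url, start_url):
    # Only the host component matters; paths and queries are ignored.
    return urlparse(url).netloc == urlparse(start_url).netloc

print(same_host("http://news.yahoo.com/news", "http://news.yahoo.com"))  # True
print(same_host("http://news.google.com", "http://news.yahoo.com"))      # False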
Example:
Input:
urls = [
"http://news.yahoo.com",
"http://news.yahoo.com/news",
"http://news.yahoo.com/news/topics/",
"http://news.google.com",
"http://news.yahoo.com/us"
]
edges = [[2,0],[2,1],[3,2],[3,1],[0,4]]
startUrl = "http://news.yahoo.com/news/topics/"
Output: [
"http://news.yahoo.com",
"http://news.yahoo.com/news",
"http://news.yahoo.com/news/topics/",
"http://news.yahoo.com/us"
]
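The urls/edges arrays describe the link graph used by the test harness. A small mock parser makes the solutions below runnable locally; the reading edges[i] = [source_index, target_index] is an assumption, but it reproduces the expected output above:
class MockHtmlParser:
    # Assumed harness semantics: edges[i] = [src, dst] means page
    # urls[src] contains a link to page urls[dst].
    def __init__(self, urls, edges):
        self.links = {u: [] for u in urls}
        for src, dst in edges:
            self.links[urls[src]].append(urls[dst])

    def getUrls(self, url):
        return self.links.get(url, [])
With this mock, crawl(startUrl, MockHtmlParser(urls, edges)) on the example above should return the four news.yahoo.com URLs in some order.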
C# solution
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;

// Problem-provided interface; stubbed here so the snippet compiles standalone.
public class HtmlParser {
    public List<string> GetUrls(string url) {
        return new List<string>();
    }
}

public class Solution {
    private ConcurrentDictionary<string, bool> visited = new ConcurrentDictionary<string, bool>();
    private string hostname;
    private HtmlParser htmlParser;

    public IList<string> Crawl(string startUrl, HtmlParser htmlParser) {
        this.hostname = new Uri(startUrl).Host;
        this.htmlParser = htmlParser;
        visited.TryAdd(startUrl, true);
        Task.Run(() => Visit(startUrl)).Wait();
        return visited.Keys.ToList();
    }

    private async Task Visit(string url) {
        // Fan out: crawl each newly discovered same-host link on its own task.
        var children = new List<Task>();
        foreach (var nextUrl in htmlParser.GetUrls(url)) {
            if (new Uri(nextUrl).Host == hostname && visited.TryAdd(nextUrl, true)) {
                children.Add(Task.Run(() => Visit(nextUrl)));
            }
        }
        await Task.WhenAll(children);
    }
}
C++ solution
#include <bits/stdc++.h>
using namespace std;

// Problem-provided interface; stubbed here so the snippet compiles standalone.
class HtmlParser {
public:
    vector<string> getUrls(string url) {
        return {};
    }
};

class Solution {
public:
    vector<string> crawl(string startUrl, HtmlParser htmlParser) {
        hostname = extractHostname(startUrl);
        visited.insert(startUrl);
        visit(startUrl, htmlParser);
        return vector<string>(visited.begin(), visited.end());
    }

private:
    string hostname;
    unordered_set<string> visited;
    mutex mtx;

    // The hostname is the substring between "//" and the next '/'.
    static string extractHostname(const string& url) {
        size_t start = url.find("//") + 2;
        size_t end = url.find('/', start);
        return url.substr(start, end == string::npos ? string::npos : end - start);
    }

    void visit(const string& url, HtmlParser& parser) {
        vector<thread> workers;
        for (const string& nextUrl : parser.getUrls(url)) {
            if (extractHostname(nextUrl) != hostname) continue;
            bool inserted;
            {
                lock_guard<mutex> guard(mtx);
                inserted = visited.insert(nextUrl).second;
            }
            if (inserted) {
                // Crawl each newly discovered link on its own thread.
                workers.emplace_back(&Solution::visit, this, nextUrl, ref(parser));
            }
        }
        for (thread& w : workers) w.join();
    }
};
Java solution
import java.net.*;
import java.util.*;
import java.util.concurrent.*;

// Problem-provided interface; stubbed here so the snippet compiles standalone.
class HtmlParser {
    public List<String> getUrls(String url) {
        return new ArrayList<>();
    }
}

public class Solution {
    public List<String> crawl(String startUrl, HtmlParser htmlParser) {
        String hostname = extractHostname(startUrl);
        Set<String> visited = ConcurrentHashMap.newKeySet();
        ExecutorService executor = Executors.newFixedThreadPool(10);
        Queue<Future<?>> futures = new ConcurrentLinkedQueue<>();
        visited.add(startUrl);
        futures.add(executor.submit(() -> visit(startUrl, htmlParser, hostname, visited, futures, executor)));
        // Drain the queue: a task enqueues its children before it finishes,
        // so an empty queue means the whole crawl is done.
        while (!futures.isEmpty()) {
            try {
                futures.poll().get();
            } catch (InterruptedException | ExecutionException e) {
                e.printStackTrace();
            }
        }
        executor.shutdown();
        return new ArrayList<>(visited);
    }

    private void visit(String url, HtmlParser htmlParser, String hostname, Set<String> visited, Queue<Future<?>> futures, ExecutorService executor) {
        for (String nextUrl : htmlParser.getUrls(url)) {
            if (extractHostname(nextUrl).equals(hostname) && visited.add(nextUrl)) {
                futures.add(executor.submit(() -> visit(nextUrl, htmlParser, hostname, visited, futures, executor)));
            }
        }
    }

    private String extractHostname(String url) {
        try {
            return new URL(url).getHost();
        } catch (MalformedURLException e) {
            return "";
        }
    }
}
JavaScript solution
class HtmlParser {
    getUrls(url) {
        return [];
    }
}

const extractHostname = (url) => new URL(url).hostname;

const crawl = async (startUrl, htmlParser) => {
    const hostname = extractHostname(startUrl);
    const visited = new Set([startUrl]);
    let level = [startUrl];
    while (level.length) {
        // Fetch every page on the current BFS level concurrently.
        const results = await Promise.all(level.map((url) => htmlParser.getUrls(url)));
        const next = [];
        for (const urls of results) {
            for (const nextUrl of urls) {
                if (extractHostname(nextUrl) === hostname && !visited.has(nextUrl)) {
                    visited.add(nextUrl);
                    next.push(nextUrl);
                }
            }
        }
        level = next;
    }
    return Array.from(visited);
};
Python solution
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urlparse

# Problem-provided interface; stubbed here so the snippet runs standalone.
class HtmlParser:
    def getUrls(self, url: str) -> list[str]:
        return []

def extract_hostname(url):
    return urlparse(url).netloc

def crawl(startUrl, htmlParser):
    hostname = extract_hostname(startUrl)
    visited = {startUrl}
    with ThreadPoolExecutor(max_workers=10) as executor:
        frontier = [startUrl]
        while frontier:
            # Fetch every page on the current frontier in parallel; visited is
            # only touched by the main thread, so no lock is needed.
            results = executor.map(htmlParser.getUrls, frontier)
            frontier = []
            for urls in results:
                for next_url in urls:
                    if extract_hostname(next_url) == hostname and next_url not in visited:
                        visited.add(next_url)
                        frontier.append(next_url)
    return list(visited)
Go solution
package main

import (
    "net/url"
    "sync"
)

// Problem-provided interface; stubbed here so the snippet compiles standalone.
type HtmlParser struct{}

func (p *HtmlParser) GetUrls(url string) []string {
    return []string{}
}

func extractHostname(rawurl string) string {
    parsedUrl, _ := url.Parse(rawurl)
    return parsedUrl.Host
}

func crawl(startUrl string, htmlParser HtmlParser) []string {
    hostname := extractHostname(startUrl)
    visited := sync.Map{}
    var wg sync.WaitGroup
    queue := make(chan string, 100) // buffer assumed large enough for pending URLs

    visited.Store(startUrl, true)
    wg.Add(1) // count startUrl before any worker can consume it
    queue <- startUrl

    worker := func() {
        for current := range queue {
            for _, nextUrl := range htmlParser.GetUrls(current) {
                if extractHostname(nextUrl) == hostname {
                    if _, loaded := visited.LoadOrStore(nextUrl, true); !loaded {
                        wg.Add(1)
                        queue <- nextUrl
                    }
                }
            }
            wg.Done()
        }
    }
    for i := 0; i < 10; i++ {
        go worker()
    }
    wg.Wait()
    close(queue)

    var result []string
    visited.Range(func(key, value interface{}) bool {
        result = append(result, key.(string))
        return true
    })
    return result
}

func main() {}
Algorithm
Extract the hostname from startUrl.
Use multiple threads to process URLs in parallel (see the worker-pool sketch below).
Track visited URLs to avoid crawling the same page twice.
Use HtmlParser to fetch the URLs linked from each page.
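These steps map naturally onto a queue-backed worker pool. Below is a minimal Python sketch of that pattern (illustrative names such as crawl_pool, synchronous getUrls assumed); queue.Queue's task_done/join pair provides the "wait until every discovered page is processed" barrier:
import threading
from queue import Queue
from urllib.parse import urlparse

def crawl_pool(start_url, html_parser, workers=8):
    hostname = urlparse(start_url).netloc
    visited = {start_url}
    lock = threading.Lock()
    tasks = Queue()
    tasks.put(start_url)

    def worker():
        while True:
            page = tasks.get()
            for next_url in html_parser.getUrls(page):
                if urlparse(next_url).netloc == hostname:
                    with lock:
                        if next_url in visited:
                            continue
                        visited.add(next_url)
                    tasks.put(next_url)
            tasks.task_done()

    # Daemon workers exit with the process once the crawl has finished.
    for _ in range(workers):
        threading.Thread(target=worker, daemon=True).start()
    tasks.join()  # blocks until every queued URL has been processed
    return list(visited)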