This commit is contained in:
Askill 2023-07-24 23:12:15 +02:00
commit cc6a203def
11 changed files with 372040 additions and 0 deletions

19
.vscode/launch.json vendored Normal file
View File

@ -0,0 +1,19 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python: Current File",
"type": "python",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
"justMyCode": true,
"env": {
"GEVENT_SUPPORT": "True"
},
}
]
}

368749
csv_file2.csv Normal file

File diff suppressed because it is too large Load Diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 6.3 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.1 KiB

11
go.mod Normal file
View File

@ -0,0 +1,11 @@
module askill/buzzword-graph
go 1.20
require (
github.com/awalterschulze/gographviz v0.0.0-20200901124122-0eecad45bd71 // indirect
github.com/blushft/go-diagrams v0.0.0-20201006005127-c78c821223d9 // indirect
github.com/dominikbraun/graph v0.23.0
github.com/google/uuid v1.1.2 // indirect
golang.org/x/net v0.0.0-20190620200207-3b0461eec859 // indirect
)

66
go.sum Normal file
View File

@ -0,0 +1,66 @@
dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU=
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo=
github.com/UnnoTed/fileb0x v1.1.4/go.mod h1:X59xXT18tdNk/D6j+KZySratBsuKJauMtVuJ9cgOiZs=
github.com/awalterschulze/gographviz v0.0.0-20200901124122-0eecad45bd71 h1:m3N1Fv5vE5IcxuTOGFGGV0grrVFHV8UY2SV0wSBXAC8=
github.com/awalterschulze/gographviz v0.0.0-20200901124122-0eecad45bd71/go.mod h1:/ynarkO/43wP/JM2Okn61e8WFMtdbtA8he7GJxW+SFM=
github.com/blushft/go-diagrams v0.0.0-20201006005127-c78c821223d9 h1:mV+hh0rMjzrhg7Jc/GKwpa+y/0BMHGOHdM9yY1GYyFI=
github.com/blushft/go-diagrams v0.0.0-20201006005127-c78c821223d9/go.mod h1:nDeXEIaeDV+mAK1gBD3/RJH67DYPC0GdaznWN7sB07s=
github.com/bmatcuk/doublestar v1.1.1/go.mod h1:UD6OnuiIn0yFxxA2le/rnRU1G4RaI4UvFv1sNto9p6w=
github.com/dave/jennifer v1.4.1/go.mod h1:7jEdnm+qBcxl8PC0zyp7vxcpSRnzXSt9r39tpTVGlwA=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ=
github.com/dominikbraun/graph v0.23.0 h1:TdZB4pPqCLFxYhdyMFb1TBdFxp8XLcJfTTBQucVPgCo=
github.com/dominikbraun/graph v0.23.0/go.mod h1:yOjYyogZLY1LSG9E33JWZJiq5k83Qy2C6POAuiViluc=
github.com/gizak/termui/v3 v3.1.0/go.mod h1:bXQEBkJpzxUAKf0+xq9MSWAvWZlE7c+aidmyFlkYTrY=
github.com/go-gl/glfw/v3.3/glfw v0.0.0-20200222043503-6f7a984d4dc4/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8=
github.com/google/uuid v1.1.2 h1:EVhdT+1Kseyi1/pUmXKaFxYsDNy9RQYkMWRH68J/W7Y=
github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/iancoleman/strcase v0.1.1/go.mod h1:SK73tn/9oHe+/Y0h39VT4UCxmurVJkR5NA7kMEAOgSE=
github.com/karrick/godirwalk v1.7.8/go.mod h1:2c9FRhkDxdIbgkOnCEvnSWs71Bhugbl46shStcFDJ34=
github.com/labstack/echo v3.2.1+incompatible/go.mod h1:0INS7j/VjnFxD4E2wkz67b8cVwCLbBmJyDaka6Cmk1s=
github.com/labstack/gommon v0.2.7/go.mod h1:/tj9csK2iPSBvn+3NLM9e52usepMtrd5ilFYA+wQNJ4=
github.com/mattn/go-colorable v0.0.9/go.mod h1:9vuHe8Xs5qXnSaW/c/ABM9alt+Vo+STaOChaDxuIBZU=
github.com/mattn/go-isatty v0.0.4/go.mod h1:M+lRXTBqGeGNdLjl/ufCoiOlB5xdOkqRJdNxMWT7Zi4=
github.com/mattn/go-runewidth v0.0.2/go.mod h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzpuz5H//U1FU=
github.com/mattn/go-runewidth v0.0.3/go.mod h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzpuz5H//U1FU=
github.com/mitchellh/go-wordwrap v0.0.0-20150314170334-ad45545899c7/go.mod h1:ZXFpozHsX6DPmq2I0TCekCxypsnAUbP2oI0UX1GXzOo=
github.com/mitchellh/go-wordwrap v1.0.0/go.mod h1:ZXFpozHsX6DPmq2I0TCekCxypsnAUbP2oI0UX1GXzOo=
github.com/nsf/termbox-go v0.0.0-20190121233118-02980233997d/go.mod h1:IuKpRQcYE1Tfu+oAQqaLisqDeXgjyyltCfsaoYN18NQ=
github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc=
github.com/valyala/fasttemplate v0.0.0-20170224212429-dcecefd839c4/go.mod h1:50wTf68f99/Zt14pr046Tgt3Lp2vLyFZKzbFXTOabXw=
golang.org/x/crypto v0.0.0-20180910181607-0e37d006457b/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20200908183739-ae8ad444f925/go.mod h1:1phAWC201xIgDyaFpmDeZkgf70Q4Pd/CNqfRtVPtxNw=
golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js=
golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
golang.org/x/mobile v0.0.0-20190719004257-d2bd2a29d028/go.mod h1:E/iHnbuqvinMTCcRqshq8CkpyQDoeVncDDYHnLhea+o=
golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg=
golang.org/x/mod v0.3.1-0.20200828183125-ce943fd02449/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/net v0.0.0-20180921000356-2f5d2388922f/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859 h1:R/3boaszxrf1GEUWTVDzSKVwLmSJpwZ1yqXm8j0v2QI=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20181019160139-8e24a49d80f8/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20191001151750-bb3f8db39f24/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20200207183749-b753a1ba74fa/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

188
main.go Normal file
View File

@ -0,0 +1,188 @@
package main
import (
"encoding/csv"
"fmt"
"log"
"os"
"strings"
"github.com/dominikbraun/graph"
"github.com/dominikbraun/graph/draw"
)
// Edge is a weighted, directed link stored on the Node it starts from.
type Edge struct {
	// name is the name of the node the edge points to; addEdges uses the
	// same string as the key in the owning Node's edges map.
	name string
	// weight is set to 1 when the edge is first created and incremented by
	// 1 each time addEdges sees the same (start, target) pair again.
	weight float64
	// target is the node this edge leads to.
	target *Node
}
// Node is a named vertex together with its outgoing edges.
type Node struct {
	// edges holds the outgoing edges, keyed by the target node's name.
	edges map[string]Edge
	// name identifies the node; it is also the key under which the node is
	// stored in Thesaurus.pntrmap.
	name string
}
// makeNode allocates a Node called name whose edge map is empty but already
// initialised, so edges can be inserted without a further nil check.
func makeNode(name string) *Node {
	return &Node{
		name:  name,
		edges: map[string]Edge{},
	}
}
// Thesaurus is the container for the whole graph: an index of every Node by
// name.
type Thesaurus struct {
	// pntrmap maps a node name to its Node. It must be non-nil before
	// addEntry or addEdges is called, because both insert into it.
	pntrmap map[string]*Node
}
// addEntry makes sure a node named start exists in the thesaurus and then
// adds (or strengthens) an edge from start to every name in targets.
func (thes *Thesaurus) addEntry(start string, targets []string) {
	if _, known := thes.pntrmap[start]; !known {
		thes.pntrmap[start] = makeNode(start)
	}
	thes.addEdges(start, targets)
}
// addEdges links the node named start to each name in targets.
//
// An edge that already exists has its weight increased by 1. Otherwise a new
// edge with weight 1 is created, and the target node is created first if the
// thesaurus does not know it yet. The node for start must already be present
// in pntrmap (addEntry guarantees this).
func (thes *Thesaurus) addEdges(start string, targets []string) {
	source := thes.pntrmap[start]
	for _, targetName := range targets {
		// Edge is stored by value, so the updated copy has to be written
		// back into the map.
		if existing, ok := source.edges[targetName]; ok {
			existing.weight += 1
			source.edges[targetName] = existing
			continue
		}

		targetNode, known := thes.pntrmap[targetName]
		if !known {
			targetNode = makeNode(targetName)
			thes.pntrmap[targetName] = targetNode
		}
		source.edges[targetName] = Edge{name: targetName, weight: 1, target: targetNode}
	}
}
// readCsvFile opens filePath, parses the entire file as CSV and returns every
// record as a slice of fields.
//
// Failing to open or parse the file is fatal: the error is logged and the
// program exits through log.Fatalf.
func readCsvFile(filePath string) [][]string {
	f, err := os.Open(filePath)
	if err != nil {
		// Fatalf with explicit verbs: log.Fatal formats like fmt.Print, which
		// inserts no separator between a string operand and the error, so the
		// path and the error text used to run together.
		log.Fatalf("Unable to read input file %s: %v", filePath, err)
	}
	defer f.Close()

	records, err := csv.NewReader(f).ReadAll()
	if err != nil {
		log.Fatalf("Unable to parse file as CSV for %s: %v", filePath, err)
	}
	return records
}
// test is a manual smoke check: it builds a three-node thesaurus from a fixed
// list of entries (including a self-loop and a repeated entry) and prints each
// node's edge map to stdout.
func test() {
	thes := Thesaurus{pntrmap: map[string]*Node{}}

	entries := []struct {
		start   string
		targets []string
	}{
		{"node1", []string{"node2"}},
		{"node2", []string{"node2", "node1"}},
		{"node3", []string{"node1"}},
		{"node3", []string{"node1"}}, // repeated on purpose: bumps the edge weight
	}
	for _, entry := range entries {
		thes.addEntry(entry.start, entry.targets)
	}

	for name, node := range thes.pntrmap {
		fmt.Println(name, node.edges)
	}
}
// printThesaurus dumps every node name with its edge map to stdout, writing a
// line of '#' characters after each node as a visual separator.
func printThesaurus(thes Thesaurus) {
	const separator = "####################################################################"
	for name, node := range thes.pntrmap {
		fmt.Println(name, node.edges)
		fmt.Println(separator)
	}
}
// trim strips quotes, punctuation and all spaces from word, wherever they
// occur in the string (not only at its ends).
//
// The replacements run in three stages, in this order: characters that are
// deleted outright, characters that are first turned into a space, and
// finally the removal of every space.
func trim(word string) string {
	deleted := []string{"\"", "'", ".", "!", "?"}
	spaced := []string{":", "#", ",", "(", ")", "”", "“"}

	cleaned := word
	for _, ch := range deleted {
		cleaned = strings.Replace(cleaned, ch, "", -1)
	}
	for _, ch := range spaced {
		cleaned = strings.Replace(cleaned, ch, " ", -1)
	}
	return strings.Replace(cleaned, " ", "", -1)
}
// contains reports whether str is an element of the slice slc points to.
// slc itself must not be nil; it is dereferenced unconditionally.
func contains(slc *[]string, str string) bool {
	items := *slc
	for i := 0; i < len(items); i++ {
		if items[i] == str {
			return true
		}
	}
	return false
}
// drawNode adds node and a bounded part of its neighbourhood to the graph x
// and returns x.
//
// Parameters:
//   - x:     graph that receives the vertices and edges.
//   - node:  node to start from; a nil node is ignored.
//   - drawn: names of the vertices already added to x; appended to as new
//     vertices are added.
//   - limit: remaining recursion depth; nothing is drawn once it reaches 0.
//
// Bounds applied:
//   - recursion stops when limit <= 0 or when more than 100 vertices are
//     recorded in drawn (checked on entry only);
//   - edges with weight below 0.001 are skipped;
//   - at most nine edges per call are followed. Go does not specify map
//     iteration order, so which nine is not deterministic.
//
// Errors from AddVertex and AddEdge are deliberately discarded: a node may be
// reached again through a different path, in which case the edge may already
// be present.
func drawNode(x graph.Graph[string, string], node *Node, drawn *[]string, limit int) graph.Graph[string, string] {
	// Guard against a nil node (e.g. a map lookup for a name that was never
	// added) instead of panicking on node.name below.
	if node == nil || limit <= 0 || len(*drawn) > 100 {
		return x
	}

	if !contains(drawn, node.name) {
		_ = x.AddVertex(node.name, graph.VertexAttribute("label", node.name))
		*drawn = append(*drawn, node.name)
	}

	edgesCounter := 0
	for _, e := range node.edges {
		if e.weight < 0.001 {
			continue
		}
		// The counter is incremented before the check, so the tenth
		// qualifying edge ends the loop without being drawn.
		edgesCounter++
		if edgesCounter >= 10 {
			break
		}

		// The target vertex must exist before an edge to it can be added.
		if !contains(drawn, e.target.name) {
			_ = x.AddVertex(e.target.name, graph.VertexAttribute("label", e.target.name))
			*drawn = append(*drawn, e.target.name)
		}
		_ = x.AddEdge(node.name, e.target.name)

		x = drawNode(x, e.target, drawn, limit-1)
	}
	return x
}
// main builds a word-succession graph from the third column of
// ./csv_file2.csv and writes the neighbourhood of one start word to
// my-graph.gv in DOT format.
//
// Steps:
//  1. every pair of adjacent (space-separated, trimmed) words in a title
//     becomes an edge first-word -> second-word; repeated pairs raise the
//     edge weight;
//  2. each node's outgoing weights are normalised so they sum to 1;
//  3. the graph is drawn starting at the node named "10" with a recursion
//     depth of 13.
func main() {
	thes := Thesaurus{pntrmap: make(map[string]*Node)}

	records := readCsvFile("./csv_file2.csv")
	for _, record := range records {
		// Skip rows that have no third column instead of panicking.
		if len(record) < 3 {
			continue
		}
		words := strings.Split(record[2], " ")
		for i := 0; i < len(words)-1; i++ {
			thes.addEntry(trim(words[i]), []string{trim(words[i+1])})
		}
	}

	// Normalise outgoing weights per node. Edge values are stored by value in
	// the map, so the modified copy must be written back; assigning to the
	// loop variable alone would leave the stored weights untouched.
	// Once normalised, drawNode's "weight < 0.001" filter takes effect.
	for _, node := range thes.pntrmap {
		sum := 0.0
		for _, edge := range node.edges {
			sum += edge.weight
		}
		for key, edge := range node.edges {
			edge.weight = edge.weight / sum
			node.edges[key] = edge
		}
	}

	g := graph.New(graph.StringHash, graph.Directed())
	drawn := []string{}

	// To draw from several start nodes instead, range over thes.pntrmap and
	// call drawNode once per chosen node.
	const startWord = "10"
	start, found := thes.pntrmap[startWord]
	if !found {
		log.Fatalf("start word %q does not occur in the input", startWord)
	}
	g = drawNode(g, start, &drawn, 13)

	file, err := os.Create("my-graph.gv")
	if err != nil {
		log.Fatalf("Unable to create output file my-graph.gv: %v", err)
	}
	if err := draw.DOT(g, file); err != nil {
		file.Close()
		log.Fatalf("Unable to write graph to my-graph.gv: %v", err)
	}
	// Close explicitly (not deferred) so a failed flush is reported.
	if err := file.Close(); err != nil {
		log.Fatalf("Unable to close my-graph.gv: %v", err)
	}
}

732
my-graph.gv Normal file
View File

@ -0,0 +1,732 @@
strict digraph {
"Next" [ label="Next", weight=0 ];
"Theyve" [ label="Theyve", weight=0 ];
"Lesbian" [ label="Lesbian", weight=0 ];
"They" [ label="They", weight=0 ];
"They" -> "Respond" [ weight=0 ];
"They" -> "Destroyed" [ weight=0 ];
"They" -> "Hate" [ weight=0 ];
"They" -> "Die" [ weight=0 ];
"They" -> "Unveiled" [ weight=0 ];
"They" -> "Prefaced" [ weight=0 ];
"They" -> "Misspell" [ weight=0 ];
"They" -> "Wanted" [ weight=0 ];
"They" -> "Developed" [ weight=0 ];
"Week" [ label="Week", weight=0 ];
"Cat" [ label="Cat", weight=0 ];
"Grayson" [ label="Grayson", weight=0 ];
"Emoji" [ label="Emoji", weight=0 ];
"Fast" [ label="Fast", weight=0 ];
"William" [ label="William", weight=0 ];
"Tudyks" [ label="Tudyks", weight=0 ];
"Tudyks" -> "New" [ weight=0 ];
"Food" [ label="Food", weight=0 ];
"Friends" [ label="Friends", weight=0 ];
"Sexually" [ label="Sexually", weight=0 ];
"Turns" [ label="Turns", weight=0 ];
"Have" [ label="Have", weight=0 ];
"Have" -> "Walked" [ weight=0 ];
"Have" -> "Thrown" [ weight=0 ];
"Have" -> "WhatsApp" [ weight=0 ];
"Have" -> "Conflicting" [ weight=0 ];
"Have" -> "Awful" [ weight=0 ];
"Have" -> "Debuted" [ weight=0 ];
"Have" -> "Ended" [ weight=0 ];
"Have" -> "Worked" [ weight=0 ];
"Have" -> "Increased" [ weight=0 ];
"Among" [ label="Among", weight=0 ];
"Princess" [ label="Princess", weight=0 ];
"Banning" [ label="Banning", weight=0 ];
"Attacks" [ label="Attacks", weight=0 ];
"Size-Inclusive" [ label="Size-Inclusive", weight=0 ];
"Destroyed" [ label="Destroyed", weight=0 ];
"Obama" [ label="Obama", weight=0 ];
"Confirms" [ label="Confirms", weight=0 ];
"Featuring" [ label="Featuring", weight=0 ];
"Walked" [ label="Walked", weight=0 ];
"SAT" [ label="SAT", weight=0 ];
"Somehow" [ label="Somehow", weight=0 ];
"Insider" [ label="Insider", weight=0 ];
"Unveiled" [ label="Unveiled", weight=0 ];
"In" [ label="In", weight=0 ];
"Attack" [ label="Attack", weight=0 ];
"Curb" [ label="Curb", weight=0 ];
"Increased" [ label="Increased", weight=0 ];
"Crowds" [ label="Crowds", weight=0 ];
"Bits" [ label="Bits", weight=0 ];
"Cumming" [ label="Cumming", weight=0 ];
"Something" [ label="Something", weight=0 ];
"Bollywood" [ label="Bollywood", weight=0 ];
"Suge" [ label="Suge", weight=0 ];
"Penn" [ label="Penn", weight=0 ];
"At" [ label="At", weight=0 ];
"At" -> "Stopping" [ weight=0 ];
"At" -> "Spilt" [ weight=0 ];
"At" -> "Penn" [ weight=0 ];
"At" -> "Me" [ weight=0 ];
"At" -> "Banning" [ weight=0 ];
"At" -> "Princess" [ weight=0 ];
"At" -> "More" [ weight=0 ];
"At" -> "Bermuda" [ weight=0 ];
"At" -> "William" [ weight=0 ];
"From" [ label="From", weight=0 ];
"Bagels" [ label="Bagels", weight=0 ];
"Instead" [ label="Instead", weight=0 ];
"Left" [ label="Left", weight=0 ];
"2021" [ label="2021", weight=0 ];
"Driving" [ label="Driving", weight=0 ];
"Collected" [ label="Collected", weight=0 ];
"Arrest" [ label="Arrest", weight=0 ];
"2024" [ label="2024", weight=0 ];
"Song" [ label="Song", weight=0 ];
"10" [ label="10", weight=0 ];
"10" -> "Relationship" [ weight=0 ];
"10" -> "Serial" [ weight=0 ];
"10" -> "%" [ weight=0 ];
"10" -> "Badass" [ weight=0 ];
"10" -> "Taylor" [ weight=0 ];
"10" -> "K-Dramas" [ weight=0 ];
"10" -> "Insider" [ weight=0 ];
"10" -> "Albums" [ weight=0 ];
"10" -> "en" [ weight=0 ];
"Die" [ label="Die", weight=0 ];
"Gas" [ label="Gas", weight=0 ];
"And" [ label="And", weight=0 ];
"Needs" [ label="Needs", weight=0 ];
"Asylum" [ label="Asylum", weight=0 ];
"Bride" [ label="Bride", weight=0 ];
"Me" [ label="Me", weight=0 ];
"Claim" [ label="Claim", weight=0 ];
"Claim" -> "We" [ weight=0 ];
"Claim" -> "Gets" [ weight=0 ];
"Claim" -> "Asylum" [ weight=0 ];
"Claim" -> "Father" [ weight=0 ];
"Claim" -> "About" [ weight=0 ];
"Claim" -> "A" [ weight=0 ];
"Claim" -> "Obama" [ weight=0 ];
"Claim" -> "Their" [ weight=0 ];
"Claim" -> "Jeremy" [ weight=0 ];
"Crosshairs" [ label="Crosshairs", weight=0 ];
"2017" [ label="2017", weight=0 ];
"Jury" [ label="Jury", weight=0 ];
"K-Dramas" [ label="K-Dramas", weight=0 ];
"100K" [ label="100K", weight=0 ];
"Wanted" [ label="Wanted", weight=0 ];
"Wanted" -> "From" [ weight=0 ];
"Wanted" -> "Volunteers" [ weight=0 ];
"Wanted" -> "Featuring" [ weight=0 ];
"Wanted" -> "Poster" [ weight=0 ];
"Wanted" -> "A" [ weight=0 ];
"Wanted" -> "Over" [ weight=0 ];
"Wanted" -> "Police" [ weight=0 ];
"Wanted" -> "Tough-On-Crime" [ weight=0 ];
"Wanted" -> "Something" [ weight=0 ];
"Broke" [ label="Broke", weight=0 ];
"Facebook" [ label="Facebook", weight=0 ];
"About" [ label="About", weight=0 ];
"After" [ label="After", weight=0 ];
"After" -> "Raping" [ weight=0 ];
"After" -> "Bollywood" [ weight=0 ];
"After" -> "State" [ weight=0 ];
"After" -> "Suge" [ weight=0 ];
"After" -> "Spraining" [ weight=0 ];
"After" -> "Louisville" [ weight=0 ];
"After" -> "Recent" [ weight=0 ];
"After" -> "Driving" [ weight=0 ];
"After" -> "VFX" [ weight=0 ];
"Ended" [ label="Ended", weight=0 ];
"More" [ label="More", weight=0 ];
"Their" [ label="Their", weight=0 ];
"Survived" [ label="Survived", weight=0 ];
"Antifreeze" [ label="Antifreeze", weight=0 ];
"Last" [ label="Last", weight=0 ];
"Pugs" [ label="Pugs", weight=0 ];
"Hate" [ label="Hate", weight=0 ];
"Suffragists" [ label="Suffragists", weight=0 ];
"New" [ label="New", weight=0 ];
"New" -> "Kraft" [ weight=0 ];
"New" -> "Words" [ weight=0 ];
"New" -> "Lesbian" [ weight=0 ];
"New" -> "Ugly" [ weight=0 ];
"New" -> "SAT" [ weight=0 ];
"New" -> "Fast" [ weight=0 ];
"New" -> "Size-Inclusive" [ weight=0 ];
"New" -> "Trucks" [ weight=0 ];
"New" -> "Emoji" [ weight=0 ];
"Jones" [ label="Jones", weight=0 ];
"Orphan" [ label="Orphan", weight=0 ];
"Defenses" [ label="Defenses", weight=0 ];
"Defenses" -> "At" [ weight=0 ];
"Defenses" -> "After" [ weight=0 ];
"We" [ label="We", weight=0 ];
"A" [ label="A", weight=0 ];
"Yentob" [ label="Yentob", weight=0 ];
"Kurdis" [ label="Kurdis", weight=0 ];
"Vandals" [ label="Vandals", weight=0 ];
"Vandals" -> "Claim" [ weight=0 ];
"Vandals" -> "Spray" [ weight=0 ];
"Vandals" -> "Who" [ weight=0 ];
"Vandals" -> "Have" [ weight=0 ];
"Vandals" -> "Targeted" [ weight=0 ];
"Arent" [ label="Arent", weight=0 ];
"Guide" [ label="Guide", weight=0 ];
"Thrown" [ label="Thrown", weight=0 ];
"Over" [ label="Over", weight=0 ];
"Over" -> "State" [ weight=0 ];
"Over" -> "Christian" [ weight=0 ];
"Over" -> "Next" [ weight=0 ];
"Over" -> "Jury" [ weight=0 ];
"Over" -> "100K" [ weight=0 ];
"Over" -> "$110000" [ weight=0 ];
"Over" -> "Alan" [ weight=0 ];
"Over" -> "Bagels" [ weight=0 ];
"Over" -> "To" [ weight=0 ];
"%" [ label="%", weight=0 ];
"Stopping" [ label="Stopping", weight=0 ];
"Taylor" [ label="Taylor", weight=0 ];
"Worked" [ label="Worked", weight=0 ];
"Heart" [ label="Heart", weight=0 ];
"For" [ label="For", weight=0 ];
"Jeremy" [ label="Jeremy", weight=0 ];
"Trial" [ label="Trial", weight=0 ];
"Ad" [ label="Ad", weight=0 ];
"Quit" [ label="Quit", weight=0 ];
"Spray" [ label="Spray", weight=0 ];
"Spray" -> "Needs" [ weight=0 ];
"Spray" -> "With" [ weight=0 ];
"Spray" -> "Face" [ weight=0 ];
"Spray" -> "Instead" [ weight=0 ];
"Spray" -> "As" [ weight=0 ];
"Spray" -> "And" [ weight=0 ];
"Spray" -> "Against" [ weight=0 ];
"Spray" -> "At" [ weight=0 ];
"Spray" -> "From" [ weight=0 ];
"Until" [ label="Until", weight=0 ];
"Until" -> "January" [ weight=0 ];
"Until" -> "2018" [ weight=0 ];
"Until" -> "2024" [ weight=0 ];
"Until" -> "Last" [ weight=0 ];
"Until" -> "2017" [ weight=0 ];
"Until" -> "Trial" [ weight=0 ];
"Until" -> "COVID-19" [ weight=0 ];
"Until" -> "2021" [ weight=0 ];
"Until" -> "Youve" [ weight=0 ];
"COVID-19" [ label="COVID-19", weight=0 ];
"Words" [ label="Words", weight=0 ];
"State" [ label="State", weight=0 ];
"Republican" [ label="Republican", weight=0 ];
"Stewart" [ label="Stewart", weight=0 ];
"Her" [ label="Her", weight=0 ];
"Tough-On-Crime" [ label="Tough-On-Crime", weight=0 ];
"Jews" [ label="Jews", weight=0 ];
"Command" [ label="Command", weight=0 ];
"Conflicting" [ label="Conflicting", weight=0 ];
"Rekers" [ label="Rekers", weight=0 ];
"Poster" [ label="Poster", weight=0 ];
"Actors" [ label="Actors", weight=0 ];
"$110000" [ label="$110000", weight=0 ];
"Photoshop" [ label="Photoshop", weight=0 ];
"en" [ label="en", weight=0 ];
"Theyll" [ label="Theyll", weight=0 ];
"Benefits" [ label="Benefits", weight=0 ];
"Christian" [ label="Christian", weight=0 ];
"This" [ label="This", weight=0 ];
"Erica" [ label="Erica", weight=0 ];
"Debuted" [ label="Debuted", weight=0 ];
"Ugly" [ label="Ugly", weight=0 ];
"Ugly" -> "Heart" [ weight=0 ];
"Ugly" -> "People" [ weight=0 ];
"Ugly" -> "Among" [ weight=0 ];
"Ugly" -> "Friends" [ weight=0 ];
"Ugly" -> "Bits" [ weight=0 ];
"Ugly" -> "Food" [ weight=0 ];
"Ugly" -> "And" [ weight=0 ];
"Ugly" -> "Xmas" [ weight=0 ];
"Ugly" -> "This" [ weight=0 ];
"January" [ label="January", weight=0 ];
"Bermuda" [ label="Bermuda", weight=0 ];
"Gymnast" [ label="Gymnast", weight=0 ];
"Navigate" [ label="Navigate", weight=0 ];
"Alan" [ label="Alan", weight=0 ];
"Alan" -> "Tudyks" [ weight=0 ];
"Alan" -> "Yentob" [ weight=0 ];
"Alan" -> "Kurdis" [ weight=0 ];
"Alan" -> "Rekers" [ weight=0 ];
"Alan" -> "Kims" [ weight=0 ];
"Alan" -> "Jones" [ weight=0 ];
"Alan" -> "Grayson" [ weight=0 ];
"Alan" -> "Menken" [ weight=0 ];
"Alan" -> "Cumming" [ weight=0 ];
"Respond" [ label="Respond", weight=0 ];
"WhatsApp" [ label="WhatsApp", weight=0 ];
"Police" [ label="Police", weight=0 ];
"Prefaced" [ label="Prefaced", weight=0 ];
"Cyber" [ label="Cyber", weight=0 ];
"Cyber" -> "Stalker" [ weight=0 ];
"Cyber" -> "Conferences" [ weight=0 ];
"Cyber" -> "Command" [ weight=0 ];
"Cyber" -> "Vandals" [ weight=0 ];
"Cyber" -> "Savings" [ weight=0 ];
"Cyber" -> "Week" [ weight=0 ];
"Cyber" -> "Guide" [ weight=0 ];
"Cyber" -> "Attacks" [ weight=0 ];
"Cyber" -> "Defenses" [ weight=0 ];
"Realize" [ label="Realize", weight=0 ];
"Recent" [ label="Recent", weight=0 ];
"Misspell" [ label="Misspell", weight=0 ];
"Developed" [ label="Developed", weight=0 ];
"Against" [ label="Against", weight=0 ];
"Trucks" [ label="Trucks", weight=0 ];
"Kraft" [ label="Kraft", weight=0 ];
"Youve" [ label="Youve", weight=0 ];
"Gets" [ label="Gets", weight=0 ];
"Targeted" [ label="Targeted", weight=0 ];
"Targeted" -> "For" [ weight=0 ];
"Targeted" -> "In" [ weight=0 ];
"Targeted" -> "Crowds" [ weight=0 ];
"Targeted" -> "Attack" [ weight=0 ];
"Targeted" -> "Her" [ weight=0 ];
"Targeted" -> "Facebook" [ weight=0 ];
"Targeted" -> "Republican" [ weight=0 ];
"Targeted" -> "Him" [ weight=0 ];
"Targeted" -> "Jews" [ weight=0 ];
"Serial" [ label="Serial", weight=0 ];
"Albums" [ label="Albums", weight=0 ];
"Conferences" [ label="Conferences", weight=0 ];
"Conferences" -> "Theyll" [ weight=0 ];
"Conferences" -> "Curb" [ weight=0 ];
"Conferences" -> "At" [ weight=0 ];
"Conferences" -> "Theyve" [ weight=0 ];
"Conferences" -> "Until" [ weight=0 ];
"Conferences" -> "With" [ weight=0 ];
"Conferences" -> "After" [ weight=0 ];
"Louisville" [ label="Louisville", weight=0 ];
"Punched" [ label="Punched", weight=0 ];
"Kims" [ label="Kims", weight=0 ];
"Stalker" [ label="Stalker", weight=0 ];
"Menken" [ label="Menken", weight=0 ];
"Flag-Burning" [ label="Flag-Burning", weight=0 ];
"Relationship" [ label="Relationship", weight=0 ];
"Raping" [ label="Raping", weight=0 ];
"2018" [ label="2018", weight=0 ];
"To" [ label="To", weight=0 ];
"Face" [ label="Face", weight=0 ];
"Who" [ label="Who", weight=0 ];
"Who" -> "Quit" [ weight=0 ];
"Who" -> "Collected" [ weight=0 ];
"Who" -> "Photoshop" [ weight=0 ];
"Who" -> "Broke" [ weight=0 ];
"Who" -> "Survived" [ weight=0 ];
"Who" -> "Benefits" [ weight=0 ];
"Who" -> "Punched" [ weight=0 ];
"Who" -> "Somehow" [ weight=0 ];
"Who" -> "Arent" [ weight=0 ];
"Volunteers" [ label="Volunteers", weight=0 ];
"Savings" [ label="Savings", weight=0 ];
"VFX" [ label="VFX", weight=0 ];
"People" [ label="People", weight=0 ];
"People" -> "At" [ weight=0 ];
"People" -> "Realize" [ weight=0 ];
"People" -> "Cyber" [ weight=0 ];
"People" -> "Ad" [ weight=0 ];
"People" -> "Confirms" [ weight=0 ];
"People" -> "Sexually" [ weight=0 ];
"People" -> "Handled" [ weight=0 ];
"People" -> "Navigate" [ weight=0 ];
"People" -> "Left" [ weight=0 ];
"Father" [ label="Father", weight=0 ];
"24" [ label="24", weight=0 ];
"Xmas" [ label="Xmas", weight=0 ];
"Badass" [ label="Badass", weight=0 ];
"Badass" -> "Song" [ weight=0 ];
"Badass" -> "Suffragists" [ weight=0 ];
"Badass" -> "Gymnast" [ weight=0 ];
"Badass" -> "Bride" [ weight=0 ];
"Badass" -> "Erica" [ weight=0 ];
"Badass" -> "Cat" [ weight=0 ];
"Badass" -> "They" [ weight=0 ];
"Badass" -> "Turns" [ weight=0 ];
"Badass" -> "Pugs" [ weight=0 ];
"Awful" [ label="Awful", weight=0 ];
"Him" [ label="Him", weight=0 ];
"As" [ label="As", weight=0 ];
"Handled" [ label="Handled", weight=0 ];
"With" [ label="With", weight=0 ];
"With" -> "Arrest" [ weight=0 ];
"With" -> "Antifreeze" [ weight=0 ];
"With" -> "Crosshairs" [ weight=0 ];
"With" -> "Gas" [ weight=0 ];
"With" -> "Flag-Burning" [ weight=0 ];
"With" -> "24" [ weight=0 ];
"With" -> "Stewart" [ weight=0 ];
"With" -> "Actors" [ weight=0 ];
"With" -> "Orphan" [ weight=0 ];
"Spraining" [ label="Spraining", weight=0 ];
"Spilt" [ label="Spilt", weight=0 ];
}

2196
my-graph.gv.svg Normal file

File diff suppressed because it is too large Load Diff

After

Width:  |  Height:  |  Size: 117 KiB

3
requirements.txt Normal file
View File

@ -0,0 +1,3 @@
lxml
requests
aiohttp

76
scrape.py Normal file
View File

@ -0,0 +1,76 @@
import asyncio
import csv
import aiohttp
from lxml import html
# Request headers intended to make the scraper look like a browser.
# NOTE(review): no function visible in this file passes this dict to a
# request, so it currently has no effect -- confirm whether it should be
# handed to the aiohttp session.
# NOTE(review): 'name', 'location' and 'language' are not standard HTTP
# request header fields; they look like leftover form data -- confirm.
header_values = {
    'name': 'Michael Foord',
    'location': 'Northampton',
    'language': 'English',
    'User-Agent': 'Mozilla 4/0',
    'Accept-Encoding': 'gzip',
    'Accept-Language': 'en-US,en;q=0.9,es;q=0.8',
    'Upgrade-Insecure-Requests': '0',
    # The HTTP field name is spelled "Referer" (single "r"); servers do not
    # recognise "Referrer".
    'Referer': 'https://www.google.com/',
}
def get_links(start=2006, end=2023):
    """Return one BuzzFeed archive URL per calendar day.

    Args:
        start: First year to include.
        end: Last year to include (inclusive).

    Returns:
        A list of ``https://www.buzzfeed.com/archive/<year>/<month>/<day>``
        strings in chronological order, with month and day not zero-padded.
        Only dates that exist in the calendar are produced, so no requests
        are wasted on URLs such as ``.../2/30`` or ``.../4/31``.
    """
    import calendar  # local import keeps the module's import block untouched

    base = "https://www.buzzfeed.com/archive/"  # + y/m/d
    links = []
    for year in range(start, end + 1):
        for month in range(1, 13):
            # monthrange() returns (weekday of the 1st, days in the month);
            # it accounts for leap years.
            days_in_month = calendar.monthrange(year, month)[1]
            for day in range(1, days_in_month + 1):
                links.append(f"{base}{year}/{month}/{day}")
    return links
async def get_content_from_link(session, link):
    """Fetch one archive page and extract its article rows.

    Args:
        session: Object whose ``get(link)`` returns an async context manager
            yielding a response with an awaitable ``text()`` (an aiohttp
            client session in this file).
        link: Archive URL whose last three path components are
            year/month/day.

    Returns:
        A list of ``(date, index, title, link, description, author)`` tuples.
        ``zip`` stops at the shortest of the extracted lists, so a page where
        some article lacks a description or author yields fewer rows. On any
        error the failure is printed and an empty list is returned, so one
        bad page does not abort the whole scrape.
    """
    def texts_from_html_elements(elements):
        return [x.strip() for x in elements]

    try:
        async with session.get(link) as response:
            tree = html.fromstring(await response.text())

        title_path = '//div[2]/div/h2/a/text()[normalize-space()]'
        link_path = '//div[2]/div/h2/a/@href'
        desc_path = '//div[2]/div/p/text()[normalize-space()]'
        author_path = '//div[3]/div/div/a/span/text()[normalize-space()]'

        titles = tree.xpath(title_path)
        links = tree.xpath(link_path)
        descs = texts_from_html_elements(tree.xpath(desc_path))
        authors = texts_from_html_elements(tree.xpath(author_path))

        # Unpacking raises if the link has fewer than three components,
        # which is then reported like any other failure.
        year, month, day = link.split("/")[-3:]
        date = f"{year}/{month}/{day}"
        print(date)
        return list(zip([date] * len(titles), range(len(titles)), titles, links, descs, authors))
    except Exception as exc:
        # `except Exception` (not a bare except) lets task cancellation and
        # KeyboardInterrupt propagate. The index belongs to the split result:
        # print the path after ".com", not the link split on the letter "m".
        print("unable to get ", link.split(".com")[-1], repr(exc))
        return []
async def get_content_from_links(links):
    """Fetch all archive pages concurrently and flatten their rows.

    Args:
        links: Iterable of archive URLs.

    Returns:
        A single flat list containing the rows of every page, in the order
        of ``links``.
    """
    async with aiohttp.ClientSession() as session:
        pages = await asyncio.gather(
            *(get_content_from_link(session, link) for link in links)
        )
    # gather() always returns a list with one result per awaitable, in input
    # order, so no None check is needed before flattening.
    return [item for page in pages for item in page]
def main():
    """Scrape every archive page and write the rows to ./csv_file.csv.

    The CSV starts with a header row (date, index, titles, links, descs,
    authors); the collected rows are also printed to stdout.
    """
    links = get_links()
    # asyncio.run() creates and closes its own event loop; calling
    # get_event_loop() without a running loop is deprecated.
    rows = asyncio.run(get_content_from_links(links))

    # newline="" is required by the csv module; without it, platforms that
    # translate "\n" on write emit an extra blank line after every row.
    with open('./csv_file.csv', 'w', encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["date", "index", "titles", "links", "descs", "authors"])
        writer.writerows(rows)
    print(rows)


if __name__ == "__main__":
    main()