From d6d44df3321a27be9c4e317066476f3390085aec Mon Sep 17 00:00:00 2001 From: m bobak Date: Fri, 14 Oct 2022 07:49:24 -0700 Subject: [PATCH 1/5] vs using sitemap.xml fork branch --- app/controllers/Application.scala | 46 +++++++++++++++++++++++++++++++ conf/routes | 6 ++++ conf/sitemap.xml | 4 +++ 3 files changed, 56 insertions(+) create mode 100644 conf/sitemap.xml diff --git a/app/controllers/Application.scala b/app/controllers/Application.scala index 344474f35..c3dab2d6e 100644 --- a/app/controllers/Application.scala +++ b/app/controllers/Application.scala @@ -4,6 +4,7 @@ import models.{Event, UUID, UserStatus} import play.api.Play.current import play.api.mvc.Action import play.api.{Logger, Play, Routes} +import play.api.libs.json._ import services._ import util.Formatters.sanitizeHTML @@ -84,6 +85,51 @@ class Application @Inject()(files: FileService, collections: CollectionService, } } + + /** + * Returns the sitemap.xml for the datasets to be scraped for their jsonld scripts + * suggested to start like w/swagger route, but if I don't cache it, then I should change this + * otherwise it will need a filler file there; which I should provide as a cache + */ + def sitemap = Action { implicit request => + Play.resource("/public/sitemap.xml") match { //in case we cache it here someday + case Some(resource) => { + val https = Utils.https(request) + val clowderurl = new URL(Utils.baseUrl(request)) + val host = if (clowderurl.getPort == -1) { + clowderurl.getHost + } else { + clowderurl.getHost + ":" + clowderurl.getPort + } + var resultStr="" + val top= """ + """ + resultStr = resultStr.concat(top) + //though had called the route2get but couldn't change datastruct + val d = scala.io.Source.fromURL(clowderurl + "/api/datasets") + val sd = d.mkString + val parsedJson = Json.parse(sd) + val idl = (parsedJson \\ "id") + val id1=idl(1) + var uStr = "" + idl.foreach( id => { + val id_ = id.as[String] + uStr = "\n" + clowderurl + "/datasets/" + id_ + "" + resultStr = 
resultStr.concat(uStr) + }) + resultStr = resultStr + "\n" + //could cache, in case we want to reuse later, w/Ok(reult.mkString) + //_would again check cache before creating, but still problems w/: + //BufferedWriter writer = new BufferedWriter(new FileWriter(resource)); + //writer.write(resultStr); writer.close(); //getting errors again w/this + //val resultStr = "ret string vs file" + Ok(resultStr.mkString) + } + case None => NotFound("Could not find sitemap.xml") + } + } + + /** * Main page. */ diff --git a/conf/routes b/conf/routes index 0eb5f62b6..4cd844301 100644 --- a/conf/routes +++ b/conf/routes @@ -297,6 +297,12 @@ GET /javascriptRoutes # ---------------------------------------------------------------------- GET /swagger @controllers.Application.swagger GET /swaggerUI @controllers.Application.swaggerUI + +# ---------------------------------------------------------------------- +# SITEMAP +# ---------------------------------------------------------------------- +GET /sitemap.xml @controllers.Application.sitemap +GET /sitemap @controllers.Application.sitemap # ---------------------------------------------------------------------- # RESTful API diff --git a/conf/sitemap.xml b/conf/sitemap.xml new file mode 100644 index 000000000..a9a734872 --- /dev/null +++ b/conf/sitemap.xml @@ -0,0 +1,4 @@ +=placeholder right now: +Route setup to read from this cached file, so expects it + even though the caching hasn't been done yet + and right now it is returning it directly From 44a57e106e5480890aa08bbc47190eff93cc8269 Mon Sep 17 00:00:00 2001 From: m bobak Date: Fri, 14 Oct 2022 07:51:49 -0700 Subject: [PATCH 2/5] initial sitemap route --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 85bd19a86..0507df0b2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ script prior to upgrading to minimize the downtime. 
- In the docker container the folder /home/clowder/data is now whitelisted by default for uploading by reference. This can be changed using the environment variable CLOWDER_SOURCEPATH. - The current CLA for developers of clowder. +- sitemap.xml route to list dataset pages so they can be crawled for their embedded jsonld, for Google Dataset Search ### Fixed - Send email to all admins in a single email when a user submits 'Request access' for a space From 735b0b1b8ea5189dde677c3dab9005a7ecb2a49d Mon Sep 17 00:00:00 2001 From: mike bobak Date: Fri, 21 Oct 2022 10:50:30 -0500 Subject: [PATCH 3/5] Update Application.scala getDatasets for User.anonymous for now, vs using the route can look at possible caching and changing list limit in next iteration --- app/controllers/Application.scala | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/app/controllers/Application.scala b/app/controllers/Application.scala index c3dab2d6e..fe20f34b3 100644 --- a/app/controllers/Application.scala +++ b/app/controllers/Application.scala @@ -1,13 +1,13 @@ package controllers -import models.{Event, UUID, UserStatus} +import models.{Event, UUID, UserStatus, User} import play.api.Play.current import play.api.mvc.Action import play.api.{Logger, Play, Routes} import play.api.libs.json._ import services._ import util.Formatters.sanitizeHTML - +import api.Permission.Permission import java.net.URL import javax.inject.{Inject, Singleton} import scala.collection.immutable.List @@ -89,7 +89,7 @@ class Application @Inject()(files: FileService, collections: CollectionService, /** * Returns the sitemap.xml for the datasets to be scraped for their jsonld scripts * suggested to start like w/swagger route, but if I don't cache it, then I should change this - * otherwise it will need a filler file there; which I should provide as a cache + * otherwise it will need a filler file there; which I provide till used as a cache */ def sitemap = Action { implicit request => 
Play.resource("/public/sitemap.xml") match { //in case we cache it here someday @@ -101,28 +101,20 @@ class Application @Inject()(files: FileService, collections: CollectionService, } else { clowderurl.getHost + ":" + clowderurl.getPort } + val user = User.anonymous + val dd = tree.getDatasets(false,user) var resultStr="" val top= """ """ resultStr = resultStr.concat(top) - //though had called the route2get but couldn't change datastruct - val d = scala.io.Source.fromURL(clowderurl + "/api/datasets") - val sd = d.mkString - val parsedJson = Json.parse(sd) - val idl = (parsedJson \\ "id") - val id1=idl(1) var uStr = "" - idl.foreach( id => { - val id_ = id.as[String] - uStr = "\n" + clowderurl + "/datasets/" + id_ + "" + dd.foreach( dd_ => { + val dd_id = (dd_ \ "id").as[String] + uStr = "\n" + clowderurl + "/datasets/" + dd_id + "" resultStr = resultStr.concat(uStr) }) resultStr = resultStr + "\n" - //could cache, in case we want to reuse later, w/Ok(reult.mkString) - //_would again check cache before creating, but still problems w/: - //BufferedWriter writer = new BufferedWriter(new FileWriter(resource)); - //writer.write(resultStr); writer.close(); //getting errors again w/this - //val resultStr = "ret string vs file" + //could still cache and read when nothing new but less likely if have to recheck permissions as well Ok(resultStr.mkString) } case None => NotFound("Could not find sitemap.xml") From bd9e344274f8346e7882cb0ba496cdaf7847c3e5 Mon Sep 17 00:00:00 2001 From: mike bobak Date: Thu, 27 Oct 2022 16:35:01 -0500 Subject: [PATCH 4/5] Update Application.scala from remote sitemap.xml branch --- app/controllers/Application.scala | 34 +++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/app/controllers/Application.scala b/app/controllers/Application.scala index fe20f34b3..b61c0cb57 100644 --- a/app/controllers/Application.scala +++ b/app/controllers/Application.scala @@ -1,13 +1,12 @@ package controllers -import 
models.{Event, UUID, UserStatus, User} +import models.{Event, UUID, UserStatus} import play.api.Play.current import play.api.mvc.Action import play.api.{Logger, Play, Routes} -import play.api.libs.json._ import services._ import util.Formatters.sanitizeHTML -import api.Permission.Permission + import java.net.URL import javax.inject.{Inject, Singleton} import scala.collection.immutable.List @@ -19,7 +18,8 @@ import scala.collection.mutable.ListBuffer @Singleton class Application @Inject()(files: FileService, collections: CollectionService, datasets: DatasetService, spaces: SpaceService, events: EventService, comments: CommentService, - sections: SectionService, users: UserService, selections: SelectionService) extends SecuredController { + sections: SectionService, users: UserService, selections: SelectionService, + tree: TreeService) extends SecuredController { /** * Redirect any url's that have a trailing / * @@ -85,12 +85,15 @@ class Application @Inject()(files: FileService, collections: CollectionService, } } - /** * Returns the sitemap.xml for the datasets to be scraped for their jsonld scripts * suggested to start like w/swagger route, but if I don't cache it, then I should change this - * otherwise it will need a filler file there; which I provide till used as a cache + * otherwise it will need a filler file there; which I should provide as a cache */ +import play.api.libs.json._ //put at top +import api.Permission.Permission //put at top +import models.User + def sitemap = Action { implicit request => Play.resource("/public/sitemap.xml") match { //in case we cache it here someday case Some(resource) => { @@ -101,7 +104,8 @@ class Application @Inject()(files: FileService, collections: CollectionService, } else { clowderurl.getHost + ":" + clowderurl.getPort } - val user = User.anonymous + val user = User.anonymous //not found: value User + //val dd=tree.getDatasets(true,user) //not owned by anon val dd = tree.getDatasets(false,user) var resultStr="" val top= 
""" @@ -113,15 +117,27 @@ class Application @Inject()(files: FileService, collections: CollectionService, uStr = "\n" + clowderurl + "/datasets/" + dd_id + "" resultStr = resultStr.concat(uStr) }) + //was from route + //val d = scala.io.Source.fromURL(clowderurl + "/api/datasets") + //val sd = d.mkString + //val parsedJson = Json.parse(sd) + //val idl = (parsedJson \\ "id") + //idl.foreach( id => { + // val id_ = id.as[String] + // uStr = "\n" + clowderurl + "/datasets/" + id_ + "" + // resultStr = resultStr.concat(uStr) + //}) + //will rm above once getstatsets resultStr = resultStr + "\n" - //could still cache and read when nothing new but less likely if have to recheck permissions as well + //could cache, in case we want to reuse later, w/Ok(reult.mkString) + //_would again check cache before creating, but still problems w/: + //might skip as would have to recheck permissions as well Ok(resultStr.mkString) } case None => NotFound("Could not find sitemap.xml") } } - /** * Main page. */ From fc1b0e91a915a9eb9b4af77676d96f1752e9ffe9 Mon Sep 17 00:00:00 2001 From: mike bobak Date: Thu, 27 Oct 2022 16:35:06 -0500 Subject: [PATCH 5/5] Create sitemap.xml from remote sitemap.xml branch --- public/sitemap.xml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 public/sitemap.xml diff --git a/public/sitemap.xml b/public/sitemap.xml new file mode 100644 index 000000000..a57eac995 --- /dev/null +++ b/public/sitemap.xml @@ -0,0 +1,6 @@ +filler that will be replaced with cached sitemap +though that idea might be on hold if we have to + worry about who has access to this sitemap + as clowder v1 only has public and hidden +while v2 might get a private setting where + you can see that it is there but not download it w/o auth